From ff7f51bdc40a722c5edf2ffffd098b062fc84f0a Mon Sep 17 00:00:00 2001 From: David Snyder Date: Mon, 10 Apr 2017 11:50:56 -0400 Subject: [PATCH 001/213] sid-fix-2017-02-11: [egs,scripts]: Moving SRE10 NNET2 scripts from sre10/v1/local to sre08/v1/sid/nnet2 --- egs/lre07/v1/lid/init_full_ubm_from_dnn.sh | 8 ++ egs/lre07/v1/lid/train_diag_ubm.sh | 7 +- egs/lre07/v1/lid/train_ivector_extractor.sh | 36 +++---- .../v1/lid/train_ivector_extractor_dnn.sh | 51 +++++---- egs/sre08/v1/sid/extract_ivectors_dnn.sh | 43 ++++++-- egs/sre08/v1/sid/init_full_ubm_from_dnn.sh | 78 ++++++++++---- .../dnn => sre08/v1/sid/nnet2}/get_egs2.sh | 0 .../dnn => sre08/v1/sid/nnet2}/get_lda.sh | 6 +- .../v1/sid/nnet2}/train_multisplice_accel2.sh | 28 ++--- egs/sre08/v1/sid/train_diag_ubm.sh | 5 +- egs/sre08/v1/sid/train_ivector_extractor.sh | 38 ++++--- .../v1/sid/train_ivector_extractor_dnn.sh | 87 +++++++++------ .../v1/local/dnn/run_nnet2_multisplice.sh | 22 ++-- egs/sre10/v1/local/dnn/train_dnn.sh | 3 +- egs/sre10/v1/local/plda_scoring.sh | 17 +-- egs/sre10/v1/run.sh | 37 +++---- egs/sre10/v2/run.sh | 100 +++++++++--------- src/fgmmbin/fgmm-global-init-from-accs.cc | 50 +++++---- src/gmm/full-gmm.cc | 4 +- src/ivectorbin/ivector-mean.cc | 14 +-- 20 files changed, 371 insertions(+), 263 deletions(-) rename egs/{sre10/v1/local/dnn => sre08/v1/sid/nnet2}/get_egs2.sh (100%) rename egs/{sre10/v1/local/dnn => sre08/v1/sid/nnet2}/get_lda.sh (99%) rename egs/{sre10/v1/local/dnn => sre08/v1/sid/nnet2}/train_multisplice_accel2.sh (98%) diff --git a/egs/lre07/v1/lid/init_full_ubm_from_dnn.sh b/egs/lre07/v1/lid/init_full_ubm_from_dnn.sh index 972348766b5..aeced4fb273 100755 --- a/egs/lre07/v1/lid/init_full_ubm_from_dnn.sh +++ b/egs/lre07/v1/lid/init_full_ubm_from_dnn.sh @@ -12,6 +12,7 @@ nj=40 cmd="run.pl" stage=-2 +cleanup=true # End configuration section. @@ -77,4 +78,11 @@ $cmd $dir/log/init.log \ "fgmm-global-sum-accs - $dir/stats.*.acc |" $num_components \ $dir/final.ubm || exit 1; +if $cleanup; then + echo "$0: removing stats" + for g in $(seq $nj); do + rm $dir/stats.$g.acc || exit 1 + done +fi + exit 0; diff --git a/egs/lre07/v1/lid/train_diag_ubm.sh b/egs/lre07/v1/lid/train_diag_ubm.sh index 60f2452f3b7..70325cf529d 100755 --- a/egs/lre07/v1/lid/train_diag_ubm.sh +++ b/egs/lre07/v1/lid/train_diag_ubm.sh @@ -49,7 +49,7 @@ if [ $# != 3 ]; then echo " --stage # stage to do partial re-run from." echo " --num-gselect # Number of Gaussians per frame to" echo " # limit computation to, for speed" - echo " --subsample # In main E-M phase, use every n" + echo " --subsample # In main E-M phase, use every n" echo " # frames (a speedup)" echo " --num-frames # Maximum num-frames to keep in memory" echo " # for model initialization" @@ -129,10 +129,11 @@ for x in `seq 0 $[$num_iters-1]`; do $cmd $dir/log/update.$x.log \ gmm-global-est $opt --min-gaussian-weight=$min_gaussian_weight $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \ $dir/$[$x+1].dubm || exit 1; - rm $dir/$x.*.acc $dir/$x.dubm + $cleanup && rm $dir/$x.*.acc $dir/$x.dubm fi done -rm $dir/gselect.*.gz +$cleanup && rm $dir/gselect.*.gz + mv $dir/$num_iters.dubm $dir/final.dubm || exit 1; exit 0; diff --git a/egs/lre07/v1/lid/train_ivector_extractor.sh b/egs/lre07/v1/lid/train_ivector_extractor.sh index 8e238985f99..a73bd67cbc1 100755 --- a/egs/lre07/v1/lid/train_ivector_extractor.sh +++ b/egs/lre07/v1/lid/train_ivector_extractor.sh @@ -13,7 +13,7 @@ # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). 
# (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -24,8 +24,8 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 @@ -84,7 +84,7 @@ nj_full=$[$nj*$num_processes] sdata=$data/split$nj_full; utils/split_data.sh $data $nj_full || exit 1; -parallel_opts="-pe smp $[$num_threads*$num_processes]" +parallel_opts="--num-threads $[$num_threads*$num_processes]" ## Set up features. feats="ark,s,cs:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:$sdata/JOB/feats.scp ark:- | add-deltas-sdc ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -97,7 +97,7 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1 -fi +fi # Do Gaussian selection and posterior extracion @@ -135,27 +135,25 @@ while [ $x -lt $num_iters ]; do done wait [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; - accs="" - for j in $(seq $nj); do - accs+="$dir/acc.$x.$j " - done - echo "Summing accs (pass $x)" - $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ - ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; echo "Updating model (pass $x)" nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. - $cmd -pe smp $nt $dir/log/update.$x.log \ - ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; - rm $dir/acc.$x.* - if $cleanup; then - rm $dir/acc.$x - # rm $dir/$x.ie - fi + $cmd -pe smp $nt $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie fi x=$[$x+1] done +$cleanup && rm $dir/post.*.gz rm $dir/final.ie 2>/dev/null ln -s $x.ie $dir/final.ie diff --git a/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh b/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh index 7464ce5faea..9f8fc60292b 100755 --- a/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh +++ b/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh @@ -9,16 +9,16 @@ # This script trains the i-vector extractor using a DNN-based UBM. It also requires # an fGMM, created by the script lid/init_full_gmm_from_dnn.sh. -# Note: there are 3 separate levels of parallelization: num_threads, num_processes, -# and num_jobs. 
This may seem a bit excessive. It has to do with minimizing -# memory usage and disk I/O, subject to various constraints. The "num_threads" +# Note: there are 3 separate levels of parallelization: num_threads, num_processes, +# and num_jobs. This may seem a bit excessive. It has to do with minimizing +# memory usage and disk I/O, subject to various constraints. The "num_threads" # is how many threads a program uses; the "num_processes" is the number of separate # processes a single job spawns, and then sums the accumulators in memory. # Our recommendation: # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -29,8 +29,8 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 @@ -95,9 +95,9 @@ utils/split_data.sh $data $nj_full || exit 1; sdata_dnn=$data_dnn/split$nj_full; utils/split_data.sh $data_dnn $nj_full || exit 1; - -parallel_opts="-pe smp $[$num_threads*$num_processes]" + +parallel_opts="--num-threads $[$num_threads*$num_processes]" # Set up features. @@ -114,7 +114,7 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1; -fi +fi # Do Gaussian selection and posterior extracion @@ -153,24 +153,21 @@ while [ $x -lt $num_iters ]; do done wait [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; - accs="" - for j in $(seq $nj); do - accs+="$dir/acc.$x.$j " - done - echo "Summing accs (pass $x)" - $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ - ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; - echo "Updating model (pass $x)" - nt=$[$num_threads*$num_processes] # use the same number of threads that - # each accumulation process uses, since we - # can be sure the queue will support this many. - $cmd -pe smp $nt $dir/log/update.$x.log \ - ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; - rm $dir/acc.$x.* - if $cleanup; then - rm $dir/acc.$x - # rm $dir/$x.ie - fi + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + echo "Updating model (pass $x)" + nt=$[$num_threads*$num_processes] # use the same number of threads that + # each accumulation process uses, since we + # can be sure the queue will support this many. 
+ $cmd -pe smp $nt $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie fi x=$[$x+1] done diff --git a/egs/sre08/v1/sid/extract_ivectors_dnn.sh b/egs/sre08/v1/sid/extract_ivectors_dnn.sh index 8692e6ee8a5..2687d1fc6c8 100755 --- a/egs/sre08/v1/sid/extract_ivectors_dnn.sh +++ b/egs/sre08/v1/sid/extract_ivectors_dnn.sh @@ -1,7 +1,7 @@ #!/bin/bash # Copyright 2013 Daniel Povey -# 2014-2015 David Snyder +# 2014-2017 David Snyder # 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) # 2015 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0. @@ -16,6 +16,9 @@ stage=0 min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) posterior_scale=1.0 # This scale helps to control for successive features being highly # correlated. E.g. try 0.1 or 0.3. +use_gpu=true +chunk_size=256 +nnet_job_opt= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -37,6 +40,8 @@ if [ $# != 5 ]; then echo " --num-gselect # Number of Gaussians to select using" echo " # diagonal model." echo " --min-post # Pruning threshold for posteriors" + echo " --nnet-job-opt # Options for the DNN jobs which add to or" + echo " # replace those specified by --cmd" exit 1; fi @@ -46,6 +51,21 @@ data=$3 data_dnn=$4 dir=$5 +gpu_opt="" +if $use_gpu; then + nnet_job_opt="$nnet_job_opt --gpu 1" + gpu_opt="--use-gpu=yes" + if ! cuda-compiled; then + echo "$0: WARNING: you are trying to use the GPU but you have not compiled" + echo " for CUDA. If you have GPUs and have nvcc installed, go to src/" + echo " and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be slow." + gpu_opt="--use-gpu=no" +fi + for f in $srcdir/final.ie $srcdir/final.ubm $data/feats.scp ; do [ ! -f $f ] && echo "No such file $f" && exit 1; done @@ -60,8 +80,6 @@ utils/split_data.sh $data_dnn $nj || exit 1; delta_opts=`cat $srcdir/delta_opts 2>/dev/null` -splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options - ## Set up features. 
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -69,13 +87,18 @@ nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.s if [ $stage -le 0 ]; then echo "$0: extracting iVectors" - $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ - nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \ - \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \ - \| logprob-to-post --min-post=$min_post ark:- ark:- \| \ - scale-post ark:- $posterior_scale ark:- \| \ - ivector-extract --verbose=2 $srcdir/final.ie "$feats" ark,s,cs:- \ - ark,scp,t:$dir/ivector.JOB.ark,$dir/ivector.JOB.scp || exit 1; + for g in $(seq $nj); do + $cmd $nnet_job_opt $dir/log/extract_ivectors.$g.log \ + nnet-am-compute $gpu_opt --apply-log=true --chunk-size=${chunk_size} \ + $nnet "`echo $nnet_feats | sed s/JOB/$g/g`" ark:- \ + \| select-voiced-frames ark:- scp,s,cs:$sdata/$g/vad.scp ark:- \ + \| logprob-to-post --min-post=$min_post ark:- ark:- \| \ + scale-post ark:- $posterior_scale ark:- \| \ + ivector-extract --verbose=2 $srcdir/final.ie \ + "`echo $feats | sed s/JOB/$g/g`" ark,s,cs:- \ + ark,scp,t:$dir/ivector.$g.ark,$dir/ivector.$g.scp || exit 1 & + done + wait fi if [ $stage -le 1 ]; then diff --git a/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh b/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh index f6710028ae5..c6b508a7206 100755 --- a/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh +++ b/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh @@ -1,18 +1,23 @@ #!/bin/bash -# Copyright 2015 David Snyder -# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) -# 2015 Johns Hopkins University (Author: Daniel Povey) +# Copyright 2015-2017 David Snyder +# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2015 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 # This script derives a full-covariance UBM from DNN posteriors and # speaker recognition features. # Begin configuration section. -nj=40 +nj=8 cmd="run.pl" stage=-2 delta_window=3 delta_order=2 +use_gpu=true +nnet_job_opt= +cleanup=true +chunk_size=256 +stage=0 # End configuration section. echo "$0 $@" # Print the command line for logging @@ -30,15 +35,34 @@ if [ $# != 4 ]; then echo " --nj # number of parallel training jobs" echo " --delta-window # delta window size" echo " --delta-order # delta order" - echo " # to be equal to the size of the DNN output layer." + echo " --use-gpu # Use GPU to extract DNN posteriors" + echo " --chunk-size # Number of frames processed at a time by the DNN" + echo " --nnet-job-opt # Options for the DNN jobs which add to or" + echo " # replace those specified by --cmd" exit 1; fi -data=$1 -data_dnn=$2 +data=$1 # Features for the GMM +data_dnn=$2 # Features for the DNN nnet=$3 dir=$4 +gpu_opt="" +nnet_job_opt="" +if $use_gpu; then + nnet_job_opt="$nnet_job_opt --gpu 1" + gpu_opt="--use-gpu=yes" + if ! cuda-compiled; then + echo "$0: WARNING: you are trying to use the GPU but you have not compiled" + echo " for CUDA. If you have GPUs and have nvcc installed, go to src/" + echo " and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be slow." + gpu_opt="--use-gpu=no" +fi + for f in $data/feats.scp $data/vad.scp ${data_dnn}/feats.scp \ ${data_dnn}/vad.scp $nnet; do @@ -69,16 +93,34 @@ select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" # in the ancillary GMM. 
num_components=`grep -oP 'output-dim\ \K[0-9]+' <(nnet-am-info $nnet 2> /dev/null)` -$cmd JOB=1:$nj $logdir/make_stats.JOB.log \ - nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \ - \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \ - \| logprob-to-post ark:- ark:- \| \ - fgmm-global-acc-stats-post ark:- $num_components "$feats" \ - $dir/stats.JOB.acc || exit 1; +if [ $stage -le 0 ]; then + echo "$0: accumulating stats from DNN posteriors and speaker ID features" + for g in $(seq $nj); do + $cmd $nnet_job_opt $dir/log/make_stats.$g.log \ + nnet-am-compute $gpu_opt \ + --chunk-size=${chunk_size} --apply-log=true $nnet \ + "`echo $nnet_feats | sed s/JOB/$g/g`" \ + ark:- \ + \| select-voiced-frames ark:- scp,s,cs:$sdata/$g/vad.scp ark:- \ + \| logprob-to-post ark:- ark:- \| \ + fgmm-global-acc-stats-post ark:- $num_components \ + "`echo $feats | sed s/JOB/$g/g`" \ + $dir/stats.$g.acc || exit 1 & + done + wait +fi -$cmd $dir/log/init.log \ - fgmm-global-init-from-accs --verbose=2 \ - "fgmm-global-sum-accs - $dir/stats.*.acc |" $num_components \ - $dir/final.ubm || exit 1; +if [ $stage -le 1 ]; then + echo "$0: initializing GMM from stats" + $cmd $dir/log/init.log \ + fgmm-global-init-from-accs --verbose=2 \ + "fgmm-global-sum-accs - $dir/stats.*.acc |" $num_components \ + $dir/final.ubm || exit 1; +fi -exit 0; +if $cleanup; then + echo "$0: removing stats" + for g in $(seq $nj); do + rm $dir/stats.$g.acc || exit 1 + done +fi diff --git a/egs/sre10/v1/local/dnn/get_egs2.sh b/egs/sre08/v1/sid/nnet2/get_egs2.sh similarity index 100% rename from egs/sre10/v1/local/dnn/get_egs2.sh rename to egs/sre08/v1/sid/nnet2/get_egs2.sh diff --git a/egs/sre10/v1/local/dnn/get_lda.sh b/egs/sre08/v1/sid/nnet2/get_lda.sh similarity index 99% rename from egs/sre10/v1/local/dnn/get_lda.sh rename to egs/sre08/v1/sid/nnet2/get_lda.sh index 253222ff271..89594a20f84 100755 --- a/egs/sre10/v1/local/dnn/get_lda.sh +++ b/egs/sre08/v1/sid/nnet2/get_lda.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). # 2015 David Snyder # Apache 2.0. # @@ -108,7 +108,7 @@ N=$[$num_feats/$nj] case $feat_type in raw) feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` cp $alidir/{splice_opts,final.mat} $dir || exit 1; feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" @@ -144,7 +144,7 @@ fi echo $ivector_dim >$dir/ivector_dim if [ -z "$lda_dim" ]; then - spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)" + spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)" lda_dim=$(feat-to-dim "$spliced_feats_one" -) || exit 1; fi diff --git a/egs/sre10/v1/local/dnn/train_multisplice_accel2.sh b/egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh similarity index 98% rename from egs/sre10/v1/local/dnn/train_multisplice_accel2.sh rename to egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh index f5441d6e967..461a213c8ca 100755 --- a/egs/sre10/v1/local/dnn/train_multisplice_accel2.sh +++ b/egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). 
# 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -8,7 +8,7 @@ # Apache 2.0. # This is a modified version of train_multisplice_accel2.sh in -# steps/nnet2/ for speaker recognition. The main difference is +# ../../steps/nnet2/ for speaker recognition. The main difference is # that it uses different get_lda.sh and get_egs2.sh scripts. # # The original train_multisplice_accel2.sh was a modified version of @@ -25,11 +25,11 @@ num_epochs=15 # Number of epochs of training; initial_effective_lrate=0.01 final_effective_lrate=0.001 bias_stddev=0.5 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 minibatch_size=128 # by default use a smallish minibatch size for neural net # training; this controls instability which would otherwise - # be a problem with multi-threaded update. + # be a problem with multi-threaded update. samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh @@ -78,7 +78,7 @@ precondition_rank_out=80 # relates to online preconditioning mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" +parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 @@ -92,7 +92,7 @@ transform_dir= # If supplied, overrides alidir feat_type= # Can be used to force "raw" features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -148,7 +148,7 @@ if [ $# != 4 ]; then echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -201,7 +201,7 @@ extra_opts+=(--transform-dir $transform_dir) if [ $stage -le -4 ]; then echo "$0: calling get_lda.sh" - local/dnn/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1; + sid/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1; fi # these files will have been written by get_lda.sh feat_dim=$(cat $dir/feat_dim) || exit 1; @@ -213,7 +213,7 @@ if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then extra_opts+=(--left-context $nnet_left_context ) extra_opts+=(--right-context $nnet_right_context ) echo "$0: calling get_egs2.sh" - local/dnn/get_egs2.sh $egs_opts "${extra_opts[@]}" \ + sid/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" \ --samples-per-iter $samples_per_iter --stage $get_egs_stage \ --io-opts "$io_opts" \ --cmd "$cmd" $egs_opts \ @@ -372,7 +372,7 @@ while [ $x -lt $num_iters ]; do ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! 
-z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -417,7 +417,7 @@ while [ $x -lt $num_iters ]; do steps/nnet2/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ @@ -461,7 +461,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. - + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -500,7 +500,7 @@ while [ $x -lt $num_iters ]; do n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; @@ -537,7 +537,7 @@ if [ $stage -le $num_iters ]; then cur_offset=0 # current offset from first_model_combine. for n in $(seq $max_models_combine); do next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" + sub_list="" for o in $(seq $cur_offset $[$next_offset-1]); do iter=$[$first_model_combine+$o] mdl=$dir/$iter.mdl diff --git a/egs/sre08/v1/sid/train_diag_ubm.sh b/egs/sre08/v1/sid/train_diag_ubm.sh index 6ff1a9099d9..819a51ba73e 100755 --- a/egs/sre08/v1/sid/train_diag_ubm.sh +++ b/egs/sre08/v1/sid/train_diag_ubm.sh @@ -135,10 +135,11 @@ for x in `seq 0 $[$num_iters-1]`; do $cmd $dir/log/update.$x.log \ gmm-global-est $opt --min-gaussian-weight=$min_gaussian_weight $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \ $dir/$[$x+1].dubm || exit 1; - rm $dir/$x.*.acc $dir/$x.dubm + $cleanup && rm $dir/$x.*.acc $dir/$x.dubm fi done -rm $dir/gselect.*.gz +$cleanup && rm $dir/gselect.*.gz + mv $dir/$num_iters.dubm $dir/final.dubm || exit 1; exit 0; diff --git a/egs/sre08/v1/sid/train_ivector_extractor.sh b/egs/sre08/v1/sid/train_ivector_extractor.sh index 5d7eb984485..68ba0ca65fd 100755 --- a/egs/sre08/v1/sid/train_ivector_extractor.sh +++ b/egs/sre08/v1/sid/train_ivector_extractor.sh @@ -13,7 +13,7 @@ # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -24,8 +24,8 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. 
the real number of threads we +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 @@ -90,7 +90,7 @@ if [ -f $srcdir/delta_opts ]; then cp $srcdir/delta_opts $dir/ 2>/dev/null fi -parallel_opts="-pe smp $[$num_threads*$num_processes]" +parallel_opts="--num-threads $[$num_threads*$num_processes]" ## Set up features. feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -102,7 +102,7 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1 -fi +fi # Do Gaussian selection and posterior extracion @@ -140,26 +140,24 @@ while [ $x -lt $num_iters ]; do done wait [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; - accs="" - for j in $(seq $nj); do - accs+="$dir/acc.$x.$j " - done - echo "Summing accs (pass $x)" - $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ - ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; echo "Updating model (pass $x)" nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. - $cmd -pe smp $nt $dir/log/update.$x.log \ - ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; - rm $dir/acc.$x.* - if $cleanup; then - rm $dir/acc.$x - # rm $dir/$x.ie - fi + $cmd $parallel_opts $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie fi x=$[$x+1] done - +$cleanup && rm -f $dir/post.*.gz +rm -f $dir/final.ie ln -s $x.ie $dir/final.ie diff --git a/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh b/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh index 64579735376..c64b83c5a4b 100755 --- a/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh +++ b/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh @@ -1,23 +1,23 @@ #!/bin/bash # Copyright 2013 Daniel Povey -# 2014-2015 David Snyder +# 2014-2017 David Snyder # 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) # 2015 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0. # This script trains the i-vector extractor using a DNN-based UBM. It also requires # an fGMM, usually created by the script sid/init_full_gmm_from_dnn.sh. -# Note: there are 3 separate levels of parallelization: num_threads, num_processes, -# and num_jobs. This may seem a bit excessive. It has to do with minimizing -# memory usage and disk I/O, subject to various constraints. The "num_threads" +# Note: there are 3 separate levels of parallelization: num_threads, num_processes, +# and num_jobs. This may seem a bit excessive. It has to do with minimizing +# memory usage and disk I/O, subject to various constraints. The "num_threads" # is how many threads a program uses; the "num_processes" is the number of separate # processes a single job spawns, and then sums the accumulators in memory. 
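# As a concrete example with the defaults set below (nj=5, num_processes=2,
# num_threads=4): the data is split into 5 * 2 = 10 separate pieces, and up to
# 5 * 2 * 4 = 40 threads may be running on the queue at any one time.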
# Our recommendation: # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -28,12 +28,12 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we +nj=5 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 -num_processes=4 # each job runs this many processes, each with --num-threads threads +num_processes=2 # each job runs this many processes, each with --num-threads threads cmd="run.pl" stage=-4 num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select @@ -46,6 +46,9 @@ cleanup=true posterior_scale=1.0 # This scale helps to control for successve features being highly # correlated. E.g. try 0.1 or 0.3 sum_accs_opt= +use_gpu=true +chunk_size=256 +nnet_job_opt= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -71,6 +74,9 @@ if [ $# != 5 ]; then echo " # diagonal model." echo " --sum-accs-opt # Option e.g. '-l hostname=a15' to localize" echo " # sum-accs process to nfs server." + echo " --nnet-job-opt # Options for the DNN jobs which add to or" + echo " # replace those specified by --cmd" + echo " --chunk-size # Number of frames processed at a time by the DNN" exit 1; fi @@ -80,6 +86,21 @@ data=$3 data_dnn=$4 dir=$5 +gpu_opt="" +if $use_gpu; then + nnet_job_opt="$nnet_job_opt --gpu 1" + gpu_opt="--use-gpu=yes" + if ! cuda-compiled; then + echo "$0: WARNING: you are trying to use the GPU but you have not compiled" + echo " for CUDA. If you have GPUs and have nvcc installed, go to src/" + echo " and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be slow." + gpu_opt="--use-gpu=no" +fi + srcdir=$(dirname $fgmm_model) for f in $fgmm_model $data/feats.scp ; do @@ -100,9 +121,7 @@ if [ -f $srcdir/delta_opts ]; then cp $srcdir/delta_opts $dir/ 2>/dev/null fi -splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options - -parallel_opts="-pe smp $[$num_threads*$num_processes]" +parallel_opts="--num-threads $[$num_threads*$num_processes]" ## Set up features. 
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -117,19 +136,24 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1; -fi +fi # Do Gaussian selection and posterior extraction if [ $stage -le -1 ]; then echo $nj_full > $dir/num_jobs echo "$0: doing DNN posterior computation" - $cmd JOB=1:$nj_full $dir/log/post.JOB.log \ - nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \ - \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \ - \| logprob-to-post --min-post=$min_post ark,s,cs:- ark:- \| \ - scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1; - + for g in $(seq $nj_full); do + $cmd $nnet_job_opt $dir/log/post.$g.log \ + nnet-am-compute $gpu_opt \ + --chunk-size=${chunk_size} --apply-log=true $nnet \ + "`echo $nnet_feats | sed s/JOB/$g/g`" \ + ark:- \ + \| select-voiced-frames ark:- scp,s,cs:$sdata/$g/vad.scp ark:- \ + \| logprob-to-post ark:- ark:- \ + \| scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.$g.gz" || exit 1 & + done + wait else if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)" @@ -156,26 +180,25 @@ while [ $x -lt $num_iters ]; do done wait [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; - accs="" - for j in $(seq $nj); do - accs+="$dir/acc.$x.$j " - done - echo "Summing accs (pass $x)" - $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ - ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; echo "Updating model (pass $x)" nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. - $cmd -pe smp $nt $dir/log/update.$x.log \ - ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; - rm $dir/acc.$x.* - if $cleanup; then - rm $dir/acc.$x - # rm $dir/$x.ie - fi + $cmd $parallel_opts $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie fi x=$[$x+1] done +$cleanup && rm -f $dir/post.*.gz +rm -f $dir/final.ie ln -s $x.ie $dir/final.ie diff --git a/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh b/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh index 684cc8ddfc0..c8dc351536b 100755 --- a/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh +++ b/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh @@ -4,43 +4,39 @@ # egs/fisher_english/s5/local/online. It has been modified # for speaker recognition. -. cmd.sh - - stage=1 train_stage=-10 use_gpu=true set -e -. cmd.sh +. ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh # assume use_gpu=true since it would be way too slow otherwise. if ! cuda-compiled; then - cat < local/scores_gmm_2048_dep_pooled/plda_scores +# Pool the gender dependent results. 
+mkdir -p exp/scores_gmm_2048_dep_pooled +cat exp/scores_gmm_2048_dep_male/plda_scores exp/scores_gmm_2048_dep_female/plda_scores \ + > exp/scores_gmm_2048_dep_pooled/plda_scores # GMM-2048 PLDA EER # ind pooled: 2.26 @@ -140,7 +141,7 @@ cat local/scores_gmm_2048_dep_male/plda_scores local/scores_gmm_2048_dep_female/ echo "GMM-$num_components EER" for x in ind dep; do for y in female male pooled; do - eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_gmm_${num_components}_${x}_${y}/plda_scores) 2> /dev/null` + eer=`compute-eer <(python local/prepare_for_eer.py $trials exp/scores_gmm_${num_components}_${x}_${y}/plda_scores) 2> /dev/null` echo "${x} ${y}: $eer" done done diff --git a/egs/sre10/v2/run.sh b/egs/sre10/v2/run.sh index 4f5ab2756bb..b6c24fc1371 100755 --- a/egs/sre10/v2/run.sh +++ b/egs/sre10/v2/run.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2015-2016 David Snyder +# Copyright 2015-2017 David Snyder # 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) # 2015 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0. @@ -105,62 +105,61 @@ utils/fix_data_dir.sh data/train_32k # Initialize a full GMM from the DNN posteriors and speaker recognition # features. This can be used both alone, as a UBM, or to initialize the # i-vector extractor in a DNN-based system. -sid/init_full_ubm_from_dnn.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" \ +sid/init_full_ubm_from_dnn.sh --cmd "$train_cmd --mem 15G" \ data/train_32k \ data/train_dnn_32k $nnet exp/full_ubm # Train an i-vector extractor based on just the supervised-GMM. sid/train_ivector_extractor.sh \ - --cmd "$train_cmd -l mem_free=70G,ram_free=70G" \ + --cmd "$train_cmd --mem 120G" \ --ivector-dim 600 \ --num-iters 5 exp/full_ubm/final.ubm data/train \ exp/extractor_sup_gmm # Train an i-vector extractor based on the DNN-UBM. sid/train_ivector_extractor_dnn.sh \ - --cmd "$train_cmd -l mem_free=80G,ram_free=80G" \ - --min-post 0.015 \ - --ivector-dim 600 \ - --num-iters 5 exp/full_ubm/final.ubm $nnet \ + --cmd "$train_cmd --mem 100G" --nnet-job-opt "--mem 4G" \ + --min-post 0.015 --ivector-dim 600 --num-iters 5 \ + exp/full_ubm/final.ubm $nnet \ data/train \ data/train_dnn \ exp/extractor_dnn # Extract i-vectors from the extractor with the sup-GMM UBM. sid/extract_ivectors.sh \ - --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ + --cmd "$train_cmd --mem 12G" --nj 40 \ exp/extractor_sup_gmm data/sre10_train \ exp/ivectors_sre10_train_sup_gmm sid/extract_ivectors.sh \ - --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ + --cmd "$train_cmd --mem 12G" --nj 40 \ exp/extractor_sup_gmm data/sre10_test \ exp/ivectors_sre10_test_sup_gmm sid/extract_ivectors.sh \ - --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ + --cmd "$train_cmd --mem 12G" --nj 40 \ exp/extractor_sup_gmm data/sre \ exp/ivectors_sre_sup_gmm # Extract i-vectors using the extractor with the DNN-UBM. 
sid/extract_ivectors_dnn.sh \ - --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ + --cmd "$train_cmd --mem 15G" --nj 10 \ exp/extractor_dnn \ $nnet \ data/sre10_test \ data/sre10_test_dnn \ exp/ivectors10_test_dnn -sid/extract_ivectors_dnn.sh - --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh \ + --cmd "$train_cmd --mem 15G" --nj 10 \ exp/extractor_dnn \ $nnet \ data/sre10_train \ data/sre10_train_dnn \ exp/ivectors10_train_dnn -sid/extract_ivectors_dnn.sh - --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh \ + --cmd "$train_cmd --mem 15G" --nj 10 \ exp/extractor_dnn \ $nnet \ data/sre \ @@ -183,87 +182,90 @@ local/scoring_common.sh data/sre data/sre10_train data/sre10_test \ # # local/cosine_scoring.sh data/sre10_train data/sre10_test \ # exp/ivectors_sre10_train exp/ivectors_sre10_test $trials \ -# local/scores_gmm_2048_ind_pooled +# exp/scores_gmm_2048_ind_pooled # local/lda_scoring.sh data/sre data/sre10_train data/sre10_test \ # exp/ivectors_sre exp/ivectors_sre10_train exp/ivectors_sre10_test \ -# $trials local/scores_gmm_2048_ind_pooled +# $trials exp/scores_gmm_2048_ind_pooled # Create a gender independent PLDA model and do scoring with the sup-GMM system. local/plda_scoring.sh data/sre data/sre10_train data/sre10_test \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm \ - exp/ivectors_sre10_test_sup_gmm $trials local/scores_sup_gmm_ind_pooled + exp/ivectors_sre10_test_sup_gmm $trials exp/scores_sup_gmm_ind_pooled local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_female data/sre10_test_female \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_female \ - exp/ivectors_sre10_test_sup_gmm_female $trials_female local/scores_sup_gmm_ind_female + exp/ivectors_sre10_test_sup_gmm_female $trials_female exp/scores_sup_gmm_ind_female local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_male data/sre10_test_male \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_male \ - exp/ivectors_sre10_test_sup_gmm_male $trials_male local/scores_sup_gmm_ind_male + exp/ivectors_sre10_test_sup_gmm_male $trials_male exp/scores_sup_gmm_ind_male # Create gender dependent PLDA models and do scoring with the sup-GMM system. local/plda_scoring.sh data/sre_female data/sre10_train_female data/sre10_test_female \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_female \ - exp/ivectors_sre10_test_sup_gmm_female $trials_female local/scores_sup_gmm_dep_female + exp/ivectors_sre10_test_sup_gmm_female $trials_female exp/scores_sup_gmm_dep_female local/plda_scoring.sh data/sre_male data/sre10_train_male data/sre10_test_male \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_male \ - exp/ivectors_sre10_test_sup_gmm_male $trials_male local/scores_sup_gmm_dep_male -mkdir -p local/scores_sup_gmm_dep_pooled -cat local/scores_sup_gmm_dep_male/plda_scores local/scores_sup_gmm_dep_female/plda_scores \ - > local/scores_sup_gmm_dep_pooled/plda_scores + exp/ivectors_sre10_test_sup_gmm_male $trials_male exp/scores_sup_gmm_dep_male + +# Pool the gender dependent results +mkdir -p exp/scores_sup_gmm_dep_pooled +cat exp/scores_sup_gmm_dep_male/plda_scores exp/scores_sup_gmm_dep_female/plda_scores \ + > exp/scores_sup_gmm_dep_pooled/plda_scores # Create a gender independent PLDA model and do scoring with the DNN system. 
local/plda_scoring.sh data/sre data/sre10_train data/sre10_test \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn \ - exp/ivectors_sre10_test_dnn $trials local/scores_dnn_ind_pooled + exp/ivectors_sre10_test_dnn $trials exp/scores_dnn_ind_pooled local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_female data/sre10_test_female \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_female \ - exp/ivectors_sre10_test_dnn_female $trials_female local/scores_dnn_ind_female + exp/ivectors_sre10_test_dnn_female $trials_female exp/scores_dnn_ind_female local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_male data/sre10_test_male \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_male \ - exp/ivectors_sre10_test_dnn_male $trials_male local/scores_dnn_ind_male + exp/ivectors_sre10_test_dnn_male $trials_male exp/scores_dnn_ind_male # Create gender dependent PLDA models and do scoring with the DNN system. local/plda_scoring.sh data/sre_female data/sre10_train_female data/sre10_test_female \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_female \ - exp/ivectors_sre10_test_dnn_female $trials_female local/scores_dnn_dep_female + exp/ivectors_sre10_test_dnn_female $trials_female exp/scores_dnn_dep_female local/plda_scoring.sh data/sre_male data/sre10_train_male data/sre10_test_male \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_male \ - exp/ivectors_sre10_test_dnn_male $trials_male local/scores_dnn_dep_male -mkdir -p local/scores_dnn_dep_pooled -cat local/scores_dnn_dep_male/plda_scores local/scores_dnn_dep_female/plda_scores \ - > local/scores_dnn_dep_pooled/plda_scores + exp/ivectors_sre10_test_dnn_male $trials_male exp/scores_dnn_dep_male + +mkdir -p exp/scores_dnn_dep_pooled +cat exp/scores_dnn_dep_male/plda_scores exp/scores_dnn_dep_female/plda_scores \ + > exp/scores_dnn_dep_pooled/plda_scores # Sup-GMM PLDA EER # ind pooled: 1.72 # ind female: 1.81 -# ind male: 1.56 -# dep female: 1.89 -# dep male: 1.39 -# dep pooled: 1.65 -echo "Sup-GMM-$num_components EER" +# ind male: 1.70 +# dep female: 2.03 +# dep male: 1.50 +# dep pooled: 1.79 +echo "Sup-GMM EER" for x in ind dep; do for y in female male pooled; do - eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_sup_gmm_${x}_${y}/plda_scores) 2> /dev/null` + eer=`compute-eer <(python local/prepare_for_eer.py $trials exp/scores_sup_gmm_${x}_${y}/plda_scores) 2> /dev/null` echo "${x} ${y}: $eer" done done -# DNN PLDA EER -# ind pooled: 1.05 -# ind female: 1.33 -# ind male: 0.75 -# dep female: 1.41 -# dep male: 0.64 -# dep pooled: 1.02 -echo "DNN-$num_components EER" +# DNN-UBM EER +# ind pooled: 1.01 +# ind female: 1.16 +# ind male: 0.78 +# dep female: 1.27 +# dep male: 0.61 +# dep pooled: 0.96 +echo "DNN-UBM EER" for x in ind dep; do for y in female male pooled; do - eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_dnn_${x}_${y}/plda_scores) 2> /dev/null` + eer=`compute-eer <(python local/prepare_for_eer.py $trials exp/scores_dnn_${x}_${y}/plda_scores) 2> /dev/null` echo "${x} ${y}: $eer" done done # In comparison, here is the EER for an unsupervised GMM-based system -# with 5297 components (the same as the number of senones in the DNN): +# with 5297 components (about the same as the number of senones in the DNN): # GMM-5297 PLDA EER # ind pooled: 2.25 # ind female: 2.33 diff --git a/src/fgmmbin/fgmm-global-init-from-accs.cc b/src/fgmmbin/fgmm-global-init-from-accs.cc index 23dc6be75cf..70b43e05d11 100644 --- a/src/fgmmbin/fgmm-global-init-from-accs.cc +++ 
b/src/fgmmbin/fgmm-global-init-from-accs.cc @@ -1,8 +1,8 @@ // fgmmbin/fgmm-global-init-from-accs.cc -// Copyright 2015 David Snyder -// 2015 Johns Hopkins University (Author: Daniel Povey) -// 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) +// Copyright 2015-2017 David Snyder +// 2015 Johns Hopkins University (Author: Daniel Povey) +// 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) // See ../../COPYING for clarification regarding multiple authors // @@ -61,7 +61,7 @@ int main(int argc, char *argv[]) { } int32 num_gauss = gmm_accs.NumGauss(), dim = gmm_accs.Dim(), - tot_floored = 0, gauss_floored = 0; + tot_floored = 0, gauss_floored = 0, tot_low_occ = 0; FullGmm fgmm(num_components, dim); @@ -69,23 +69,30 @@ int main(int argc, char *argv[]) { Matrix means(num_gauss, dim); std::vector > invcovars; - BaseFloat occ_sum = gmm_accs.occupancy().Sum(); for (int32 i = 0; i < num_components; i++) { - BaseFloat occ = gmm_accs.occupancy()(i), - prob; - if (occ_sum > 0.0) - prob = occ / occ_sum; - else - prob = 1.0 / num_gauss; - weights(i) = prob; - - Vector mean(gmm_accs.mean_accumulator().Row(i)); - mean.Scale(1.0 / occ); + BaseFloat occ = gmm_accs.occupancy()(i); + weights(i) = occ; + Vector mean(dim, kSetZero); + SpMatrix covar(dim, kSetZero); + + // If the occupancy for a Gaussian is very low, set it to a small value. + if (occ < 1e-10) { + weights(i) = 1e-10; + mean.SetRandn(); + Vector diag(mean.Dim()); + diag.Set(1.0); + covar.AddDiagVec(1.0, diag); + tot_low_occ++; + // This is the typical case. + } else { + mean.CopyRowFromMat(gmm_accs.mean_accumulator(), i); + mean.Scale(1.0 / occ); + covar.CopyFromSp(gmm_accs.covariance_accumulator()[i]); + covar.Scale(1.0 / occ); + covar.AddVec2(-1.0, mean); // subtract squared means. + } means.CopyRowFromVec(mean, i); - SpMatrix covar(gmm_accs.covariance_accumulator()[i]); - covar.Scale(1.0 / occ); - covar.AddVec2(-1.0, means.Row(i)); // subtract squared means. // Floor variance Eigenvalues. BaseFloat floor = std::max( static_cast(gmm_opts.variance_floor), @@ -98,14 +105,21 @@ int main(int argc, char *argv[]) { covar.InvertDouble(); invcovars.push_back(covar); } + weights.Scale(1.0 / weights.Sum()); fgmm.SetWeights(weights); fgmm.SetInvCovarsAndMeans(invcovars, means); int32 num_bad = fgmm.ComputeGconsts(); KALDI_LOG << "FullGmm has " << num_bad << " bad GConsts"; + if (tot_floored > 0) { KALDI_WARN << tot_floored << " variances floored in " << gauss_floored << " Gaussians."; } + if (tot_low_occ > 0) { + KALDI_WARN << tot_low_occ << " out of " << num_gauss + << " Gaussians had very low occupancy."; + } + WriteKaldiObject(fgmm, model_out_filename, binary_write); KALDI_LOG << "Written model to " << model_out_filename; diff --git a/src/gmm/full-gmm.cc b/src/gmm/full-gmm.cc index 7851d8648f7..0f634eeee6b 100644 --- a/src/gmm/full-gmm.cc +++ b/src/gmm/full-gmm.cc @@ -113,7 +113,7 @@ int32 FullGmm::ComputeGconsts() { // So gc is the likelihood at zero feature value. 
if (KALDI_ISNAN(gc)) { // negative infinity is OK but NaN is not acceptable - KALDI_ERR << "At component" << mix + KALDI_ERR << "At component " << mix << ", not a number in gconst computation"; } if (KALDI_ISINF(gc)) { @@ -687,7 +687,7 @@ BaseFloat FullGmm::GaussianSelectionPreselect( } Vector loglikes(preselect_sz); LogLikelihoodsPreselect(data, preselect, &loglikes); - + Vector loglikes_copy(loglikes); BaseFloat *ptr = loglikes_copy.Data(); std::nth_element(ptr, ptr+preselect_sz-this_num_gselect, diff --git a/src/ivectorbin/ivector-mean.cc b/src/ivectorbin/ivector-mean.cc index 9db070d61ab..6e6117c1eb7 100644 --- a/src/ivectorbin/ivector-mean.cc +++ b/src/ivectorbin/ivector-mean.cc @@ -42,14 +42,14 @@ int main(int argc, char *argv[]) { "e.g.: ivector-mean data/spk2utt exp/ivectors.ark exp/spk_ivectors.ark exp/spk_num_utts.ark\n" "or: ivector-mean exp/ivectors.ark exp/mean.vec\n" "See also: ivector-subtract-global-mean\n"; - + ParseOptions po(usage); bool binary_write = false; po.Register("binary", &binary_write, "If true, write output in binary " "(only applicable when writing files, not archives/tables."); - + po.Read(argc, argv); - + if (po.NumArgs() < 2 || po.NumArgs() > 4) { po.PrintUsage(); exit(1); @@ -79,10 +79,10 @@ int main(int argc, char *argv[]) { ivector_rspecifier = po.GetArg(2), ivector_wspecifier = po.GetArg(3), num_utts_wspecifier = po.GetOptArg(4); - + double spk_sumsq = 0.0; Vector spk_sum; - + int64 num_spk_done = 0, num_spk_err = 0, num_utt_done = 0, num_utt_err = 0; @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) { SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); BaseFloatVectorWriter ivector_writer(ivector_wspecifier); Int32Writer num_utts_writer(num_utts_wspecifier); - + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { std::string spk = spk2utt_reader.Key(); const std::vector &uttlist = spk2utt_reader.Value(); @@ -130,7 +130,7 @@ int main(int argc, char *argv[]) { spk_sum.AddVec(1.0, spk_mean); } } - + KALDI_LOG << "Computed mean of " << num_spk_done << " speakers (" << num_spk_err << " with no utterances), consisting of " << num_utt_done << " utterances (" << num_utt_err From 3616ff86d1b4ff343031fbd474e481d800cb33e5 Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 10 Feb 2017 14:04:53 -0800 Subject: [PATCH 002/213] [egs] egs/fisher_swbd/s5/local/online/run_nnet2_ms.sh, change default stage to 0 (#1416) --- egs/fisher_swbd/s5/local/online/run_nnet2_ms.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_swbd/s5/local/online/run_nnet2_ms.sh b/egs/fisher_swbd/s5/local/online/run_nnet2_ms.sh index 158a5148fb5..d9ca900ac63 100755 --- a/egs/fisher_swbd/s5/local/online/run_nnet2_ms.sh +++ b/egs/fisher_swbd/s5/local/online/run_nnet2_ms.sh @@ -3,7 +3,7 @@ . cmd.sh -stage=6 +stage=0 train_stage=451 use_gpu=true rescore=true From e1083d973843700fc1a42297d9b8e605c54f4957 Mon Sep 17 00:00:00 2001 From: schemreier Date: Sat, 11 Feb 2017 18:36:21 +0100 Subject: [PATCH 003/213] [egs] Add example scripts for Frisian-Dutch language (FAME! 
corpus) --- egs/fame/README.txt | 15 +++ egs/fame/s5/RESULTS | 28 ++++++ egs/fame/s5/cmd.sh | 1 + egs/fame/s5/conf/decode_dnn.config | 2 + egs/fame/s5/conf/fbank.conf | 2 + egs/fame/s5/conf/mfcc.conf | 1 + egs/fame/s5/conf/mfcc_hires.conf | 10 ++ egs/fame/s5/conf/online_cmvn.conf | 1 + egs/fame/s5/local/fame_data_prep.sh | 53 ++++++++++ egs/fame/s5/local/fame_dict_prep.sh | 36 +++++++ egs/fame/s5/local/nnet/run_dnn.sh | 120 ++++++++++++++++++++++ egs/fame/s5/local/nnet/run_dnn_fbank.sh | 125 +++++++++++++++++++++++ egs/fame/s5/local/score.sh | 1 + egs/fame/s5/local/wer_hyp_filter | 2 + egs/fame/s5/local/wer_output_filter | 2 + egs/fame/s5/local/wer_ref_filter | 2 + egs/fame/s5/path.sh | 6 ++ egs/fame/s5/run.sh | 127 ++++++++++++++++++++++++ egs/fame/s5/steps | 1 + egs/fame/s5/utils | 1 + 20 files changed, 536 insertions(+) create mode 100644 egs/fame/README.txt create mode 100644 egs/fame/s5/RESULTS create mode 120000 egs/fame/s5/cmd.sh create mode 100644 egs/fame/s5/conf/decode_dnn.config create mode 100644 egs/fame/s5/conf/fbank.conf create mode 100644 egs/fame/s5/conf/mfcc.conf create mode 100644 egs/fame/s5/conf/mfcc_hires.conf create mode 100644 egs/fame/s5/conf/online_cmvn.conf create mode 100755 egs/fame/s5/local/fame_data_prep.sh create mode 100755 egs/fame/s5/local/fame_dict_prep.sh create mode 100755 egs/fame/s5/local/nnet/run_dnn.sh create mode 100755 egs/fame/s5/local/nnet/run_dnn_fbank.sh create mode 120000 egs/fame/s5/local/score.sh create mode 100755 egs/fame/s5/local/wer_hyp_filter create mode 100755 egs/fame/s5/local/wer_output_filter create mode 100755 egs/fame/s5/local/wer_ref_filter create mode 100755 egs/fame/s5/path.sh create mode 100755 egs/fame/s5/run.sh create mode 120000 egs/fame/s5/steps create mode 120000 egs/fame/s5/utils diff --git a/egs/fame/README.txt b/egs/fame/README.txt new file mode 100644 index 00000000000..d2ed39eef75 --- /dev/null +++ b/egs/fame/README.txt @@ -0,0 +1,15 @@ +The FAME! Speech Corpus + +The components of the Frisian data collection are speech and language resources gathered for building a large vocabulary ASR system for the Frisian language. Firstly, a new broadcast database is created by collecting recordings from the archives of the regional broadcaster Omrop Fryslân, and annotating them with various information such as the language switches and speaker details. The second component of this collection is a language model created on a text corpus with diverse vocabulary. Thirdly, a Frisian phonetic dictionary with the mappings between the Frisian words and phones is built to make the ASR viable for this under-resourced language. Finally, an ASR recipe is provided which uses all previous resources to perform recognition and present the recognition performances. + +The Corpus consists of short utterances extracted from 203 audio segments of approximately 5 minutes long which are parts of various radio programs covering a time span of almost 50 years (1966-2015), adding a longitudinal dimension to the database. The content of the recordings are very diverse including radio programs about culture, history, literature, sports, nature, agriculture, politics, society and languages. The total duration of the manually annotated radio broadcasts sums up to 18 hours, 33 minutes and 57 seconds. The stereo audio data has a sampling frequency of 48 kHz and 16-bit resolution per sample. The available meta-information helped the annotators to identify these speakers and mark them either using their names or the same label (if the name is not known). 
There are 309 identified speakers in the FAME! Speech Corpus, 21 of whom appear at least 3 times in the database. These speakers are mostly program presenters and celebrities appearing multiple times in different recordings over years. There are 233 unidentified speakers due to lack of meta-information. The total number of word- and sentence-level code-switching cases in the FAME! Speech Corpus is equal to 3837. Music portions have been removed, except where these overlap with speech. + +A full description of the FAME! Speech Corpus is provided in: + +Yilmaz, E., Heuvel, H. van den, Van de Velde, H., Kampstra, F., Algra, J., Leeuwen, D. van: + +Open Source Speech and Language Resources for Frisian Language. + +In: Proceedings Interspeech 2016, pp. 1536--1540, 8-12 September 2016, San Francisco + +Please check http://www.ru.nl/clst/datasets/ to get the FAME! Speech Corpus diff --git a/egs/fame/s5/RESULTS b/egs/fame/s5/RESULTS new file mode 100644 index 00000000000..a8541fba6b5 --- /dev/null +++ b/egs/fame/s5/RESULTS @@ -0,0 +1,28 @@ +%WER 41.10 [ 4974 / 12101, 522 ins, 1223 del, 3229 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_devel/wer_11_0.0 +%WER 38.10 [ 4909 / 12886, 527 ins, 1220 del, 3162 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_test/wer_11_0.0 +%WER 41.06 [ 4969 / 12101, 514 ins, 1277 del, 3178 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_devel_it1/wer_11_0.0 +%WER 40.38 [ 4886 / 12101, 515 ins, 1225 del, 3146 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_devel_it3/wer_11_0.0 +%WER 40.15 [ 4859 / 12101, 514 ins, 1177 del, 3168 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_devel_it6/wer_10_0.5 +%WER 37.86 [ 4879 / 12886, 596 ins, 1083 del, 3200 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_test_it1/wer_10_0.0 +%WER 37.16 [ 4789 / 12886, 592 ins, 1056 del, 3141 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_test_it3/wer_10_0.0 +%WER 36.92 [ 4757 / 12886, 618 ins, 1010 del, 3129 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_test_it6/wer_10_0.0 +%WER 42.38 [ 5129 / 12101, 576 ins, 1171 del, 3382 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn/decode_devel/wer_11_0.0 +%WER 39.14 [ 5043 / 12886, 536 ins, 1172 del, 3335 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn/decode_test/wer_11_0.0 +%WER 42.05 [ 5088 / 12101, 525 ins, 1282 del, 3281 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_devel_it1/wer_11_0.0 +%WER 41.41 [ 5011 / 12101, 461 ins, 1345 del, 3205 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_devel_it3/wer_11_0.5 +%WER 40.97 [ 4958 / 12101, 485 ins, 1279 del, 3194 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_devel_it6/wer_11_0.5 +%WER 38.79 [ 4998 / 12886, 512 ins, 1194 del, 3292 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_test_it1/wer_11_0.0 +%WER 38.16 [ 4917 / 12886, 544 ins, 1128 del, 3245 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_test_it3/wer_11_0.0 +%WER 37.68 [ 4856 / 12886, 564 ins, 1068 del, 3224 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_test_it6/wer_11_0.0 +%WER 70.85 [ 8574 / 12101, 414 ins, 2596 del, 5564 sub ] exp/mono/decode_devel/wer_9_0.0 +%WER 68.17 [ 8785 / 12886, 413 ins, 2704 del, 5668 sub ] exp/mono/decode_test/wer_9_0.0 +%WER 44.05 [ 5330 / 12101, 560 ins, 1467 del, 3303 sub ] exp/sgmm2/decode_devel/wer_10_0.0 +%WER 40.22 [ 5183 / 12886, 680 ins, 1142 del, 3361 sub ] exp/sgmm2/decode_test/wer_9_0.0 +%WER 54.39 [ 6582 / 12101, 695 ins, 1595 del, 4292 sub ] exp/tri1/decode_devel/wer_10_0.0 +%WER 51.60 [ 6649 / 12886, 630 ins, 1706 del, 4313 sub ] exp/tri1/decode_test/wer_11_0.0 +%WER 51.53 [ 6236 / 12101, 659 ins, 1675 del, 3902 sub ] 
exp/tri2/decode_devel/wer_11_0.0 +%WER 48.32 [ 6226 / 12886, 643 ins, 1669 del, 3914 sub ] exp/tri2/decode_test/wer_12_0.0 +%WER 47.15 [ 5706 / 12101, 580 ins, 1537 del, 3589 sub ] exp/tri3/decode_devel/wer_13_0.0 +%WER 52.13 [ 6308 / 12101, 623 ins, 1706 del, 3979 sub ] exp/tri3/decode_devel.si/wer_11_0.5 +%WER 43.71 [ 5632 / 12886, 594 ins, 1538 del, 3500 sub ] exp/tri3/decode_test/wer_14_0.0 +%WER 48.21 [ 6212 / 12886, 825 ins, 1358 del, 4029 sub ] exp/tri3/decode_test.si/wer_10_0.0 diff --git a/egs/fame/s5/cmd.sh b/egs/fame/s5/cmd.sh new file mode 120000 index 00000000000..19f7e836644 --- /dev/null +++ b/egs/fame/s5/cmd.sh @@ -0,0 +1 @@ +../../wsj/s5/cmd.sh \ No newline at end of file diff --git a/egs/fame/s5/conf/decode_dnn.config b/egs/fame/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/fame/s5/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/fame/s5/conf/fbank.conf b/egs/fame/s5/conf/fbank.conf new file mode 100644 index 00000000000..c4b73674cab --- /dev/null +++ b/egs/fame/s5/conf/fbank.conf @@ -0,0 +1,2 @@ +# No non-default options for now. + diff --git a/egs/fame/s5/conf/mfcc.conf b/egs/fame/s5/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/fame/s5/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/fame/s5/conf/mfcc_hires.conf b/egs/fame/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/fame/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/fame/s5/conf/online_cmvn.conf b/egs/fame/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..cbdaf5f281c --- /dev/null +++ b/egs/fame/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh diff --git a/egs/fame/s5/local/fame_data_prep.sh b/egs/fame/s5/local/fame_data_prep.sh new file mode 100755 index 00000000000..2c2d1e79238 --- /dev/null +++ b/egs/fame/s5/local/fame_data_prep.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2016 Radboud University (Author: Emre Yilmaz) + +# Apache 2.0 + +corpus=$1 +set -e -o pipefail +if [ -z "$corpus" ] ; then + echo >&2 "The script $0 expects one parameter -- the location of the FAME! speech database" + exit 1 +fi +if [ ! 
-d "$corpus" ] ; then + echo >&2 "The directory $corpus does not exist" +fi + +echo "Preparing train, development and test data" +mkdir -p data data/local data/train data/devel data/test + +for x in train devel test; do + echo "Copy spk2utt, utt2spk, wav.scp, text for $x" + cp $corpus/data/$x/text data/$x/text || exit 1; + cp $corpus/data/$x/spk2utt data/$x/spk2utt || exit 1; + cp $corpus/data/$x/utt2spk data/$x/utt2spk || exit 1; + + # the corpus wav.scp contains physical paths, so we just re-generate + # the file again from scratchn instead of figuring out how to edit it + for rec in $(awk '{print $1}' $corpus/data/$x/text) ; do + spk=${rec%_*} + filename=$corpus/fame/wav/${x}/${rec:8}.wav + if [ ! -f "$filename" ] ; then + echo >&2 "The file $filename could not be found ($rec)" + exit 1 + fi + # we might want to store physical paths as a general rule + filename=$(readlink -f $filename) + echo "$rec $filename" + done > data/$x/wav.scp + + # fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp, + # duplicate entries and so on). Also, it regenerates the spk2utt from + # utt2sp + utils/fix_data_dir.sh data/$x +done + +echo "Copying language model" +if [ -f $corpus/lm/LM_FR_IKN3G ] ; then + gzip -c $corpus/lm/LM_FR_IKN3G > data/local/LM.gz +fi + +echo "Data preparation completed." + diff --git a/egs/fame/s5/local/fame_dict_prep.sh b/egs/fame/s5/local/fame_dict_prep.sh new file mode 100755 index 00000000000..c6530217a67 --- /dev/null +++ b/egs/fame/s5/local/fame_dict_prep.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2016 Radboud University (Author: Emre Yilmaz) + +# Apache 2.0 + +corpus=$1 +if [ -z "$corpus" ] ; then + echo >&2 "The script $0 expects one parameter -- the location of the Iban corpus" + exit 1 +fi +if [ ! -d "$corpus" ] ; then + echo >&2 "The directory $corpus does not exist" +fi + +mkdir -p data/lang data/local/dict + + +cat $corpus/lexicon/lex.asr $corpus/lexicon/lex.oov > data/local/dict/lexicon.txt +echo "!SIL SIL" >> data/local/dict/lexicon.txt +echo " SPN" >> data/local/dict/lexicon.txt +env LC_ALL=C sort -u -o data/local/dict/lexicon.txt data/local/dict/lexicon.txt +cat data/local/dict/lexicon.txt | \ + perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' | \ + sort -u | grep -v 'SIL' > data/local/dict/nonsilence_phones.txt + + +touch data/local/dict/extra_questions.txt +touch data/local/dict/optional_silence.txt + +echo "SIL" > data/local/dict/optional_silence.txt +echo "SIL" > data/local/dict/silence_phones.txt +echo "" > data/local/dict/oov.txt + +echo "Dictionary preparation succeeded" diff --git a/egs/fame/s5/local/nnet/run_dnn.sh b/egs/fame/s5/local/nnet/run_dnn.sh new file mode 100755 index 00000000000..ca1efa5e0ac --- /dev/null +++ b/egs/fame/s5/local/nnet/run_dnn.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2016 Radboud University (Author: Emre Yilmaz) +# Apache 2.0 + +# This example script trains a DNN on top of fMLLR features. +# The training is done in 3 stages, +# +# 1) RBM pre-training: +# in this unsupervised stage we train stack of RBMs, +# a good starting point for frame cross-entropy trainig. +# 2) frame cross-entropy training: +# the objective is to classify frames to correct pdfs. +# 3) sequence-training optimizing sMBR: +# the objective is to emphasize state-sequences with better +# frame accuracy w.r.t. reference alignment. 
+ +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +set -eu + +# Config: +gmm=exp/tri3 +data_fmllr=data-fmllr-tri3 +stage=0 # resume training with --stage=N +# End of config. +. utils/parse_options.sh +# + +[ ! -e $data_fmllr/test ] && if [ $stage -le 0 ]; then + # Store fMLLR features, so we can train on them easily, + # devel + dir=$data_fmllr/devel + steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ + --transform-dir $gmm/decode_devel \ + $dir data/devel $gmm $dir/log $dir/data + # test + dir=$data_fmllr/test + steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ + --transform-dir $gmm/decode_test \ + $dir data/test $gmm $dir/log $dir/data + # train + dir=$data_fmllr/train + steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ + --transform-dir ${gmm}_ali \ + $dir data/train $gmm $dir/log $dir/data + # split the data : 90% train 10% cross-validation (held-out) + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 +fi + +if [ $stage -le 1 ]; then + # Pre-train DBN, i.e. a stack of RBMs (small database, smaller DNN) + dir=exp/dnn4b_pretrain-dbn + $cuda_cmd $dir/log/pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh --hid-dim 2048 --rbm-iter 10 $data_fmllr/train $dir +fi + +if [ $stage -le 2 ]; then + # Train the DNN optimizing per-frame cross-entropy. + dir=exp/dnn4b_pretrain-dbn_dnn + ali=${gmm}_ali + feature_transform=exp/dnn4b_pretrain-dbn/final.feature_transform + dbn=exp/dnn4b_pretrain-dbn/6.dbn + # Train + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ + $data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir + # Decode (reuse HCLG graph) + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $data_fmllr/devel $dir/decode_devel + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $data_fmllr/test $dir/decode_test +fi + + +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. 
+dir=exp/dnn4b_pretrain-dbn_dnn_smbr +srcdir=exp/dnn4b_pretrain-dbn_dnn +acwt=0.1 + +if [ $stage -le 3 ]; then + # First we generate lattices and alignments: + steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ + $data_fmllr/train data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/train data/lang $srcdir ${srcdir}_denlats +fi + +if [ $stage -le 4 ]; then + # Re-train the DNN by 6 iterations of sMBR + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ + $data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir + # Decode + for ITER in 6 3 1; do + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $data_fmllr/devel $dir/decode_devel_it${ITER} + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $data_fmllr/test $dir/decode_test_it${ITER} + done +fi + +echo Success +exit 0 + +# Getting results [see RESULTS file] +# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done + +# to see how model conversion to nnet2 works, run run_dnn_convert_nnet2.sh at this point. + diff --git a/egs/fame/s5/local/nnet/run_dnn_fbank.sh b/egs/fame/s5/local/nnet/run_dnn_fbank.sh new file mode 100755 index 00000000000..a81449ffbcf --- /dev/null +++ b/egs/fame/s5/local/nnet/run_dnn_fbank.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2016 Radboud University (Author: Emre Yilmaz) +# Apache 2.0 + +# This example script trains a DNN on top of FBANK features. +# The training is done in 3 stages, +# +# 1) RBM pre-training: +# in this unsupervised stage we train stack of RBMs, +# a good starting point for frame cross-entropy trainig. +# 2) frame cross-entropy training: +# the objective is to classify frames to correct pdfs. +# 3) sequence-training optimizing sMBR: +# the objective is to emphasize state-sequences with better +# frame accuracy w.r.t. reference alignment. + +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +dev=data-fbank/devel +tst=data-fbank/test +train=data-fbank/train + +dev_original=data/devel +tst_original=data/test +train_original=data/train + +gmm=exp/tri3 + +stage=0 +. utils/parse_options.sh || exit 1; + +set -eu + +# Make the FBANK features +[ ! 
-e $dev ] && if [ $stage -le 0 ]; then + # Dev set + utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp + steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \ + $dev $dev/log $dev/data || exit 1; + steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1; + # Test set + utils/copy_data_dir.sh $tst_original $tst || exit 1; rm $tst/{cmvn,feats}.scp + steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \ + $tst $tst/log $tst/data || exit 1; + steps/compute_cmvn_stats.sh $tst $tst/log $tst/data || exit 1; + # Training set + utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp + steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \ + $train $train/log $train/data || exit 1; + steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; + # Split the training set + utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10 +fi + +if [ $stage -le 1 ]; then + # Pre-train DBN, i.e. a stack of RBMs (small database, smaller DNN) + dir=exp/dnn4d-fbank_pretrain-dbn + $cuda_cmd $dir/log/pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh \ + --cmvn-opts "--norm-means=true --norm-vars=true" \ + --delta-opts "--delta-order=2" --splice 5 \ + --hid-dim 2048 --rbm-iter 10 $train $dir || exit 1; +fi + +if [ $stage -le 2 ]; then + # Train the DNN optimizing per-frame cross-entropy. + dir=exp/dnn4d-fbank_pretrain-dbn_dnn + ali=${gmm}_ali + feature_transform=exp/dnn4d-fbank_pretrain-dbn/final.feature_transform + dbn=exp/dnn4d-fbank_pretrain-dbn/6.dbn + # Train + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ + ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; + # Decode (reuse HCLG graph) + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $dev $dir/decode_devel || exit 1; + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $tst $dir/decode_test || exit 1; +fi + + +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. 
+dir=exp/dnn4d-fbank_pretrain-dbn_dnn_smbr +srcdir=exp/dnn4d-fbank_pretrain-dbn_dnn +acwt=0.1 + +if [ $stage -le 3 ]; then + # First we generate lattices and alignments: + steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ + $train data/lang $srcdir ${srcdir}_ali || exit 1; + steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $train data/lang $srcdir ${srcdir}_denlats || exit 1; +fi + +if [ $stage -le 4 ]; then + # Re-train the DNN by 6 iterations of sMBR + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ + $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 + # Decode + for ITER in 6 3 1; do + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $dev $dir/decode_devel_it${ITER} || exit 1 + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $tst $dir/decode_test_it${ITER} || exit 1 + done +fi + +echo Success +exit 0 + +# Getting results [see RESULTS file] +# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/fame/s5/local/score.sh b/egs/fame/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/fame/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/fame/s5/local/wer_hyp_filter b/egs/fame/s5/local/wer_hyp_filter new file mode 100755 index 00000000000..372d1a9c73a --- /dev/null +++ b/egs/fame/s5/local/wer_hyp_filter @@ -0,0 +1,2 @@ +#!/bin/sed -f +s:::g diff --git a/egs/fame/s5/local/wer_output_filter b/egs/fame/s5/local/wer_output_filter new file mode 100755 index 00000000000..372d1a9c73a --- /dev/null +++ b/egs/fame/s5/local/wer_output_filter @@ -0,0 +1,2 @@ +#!/bin/sed -f +s:::g diff --git a/egs/fame/s5/local/wer_ref_filter b/egs/fame/s5/local/wer_ref_filter new file mode 100755 index 00000000000..372d1a9c73a --- /dev/null +++ b/egs/fame/s5/local/wer_ref_filter @@ -0,0 +1,2 @@ +#!/bin/sed -f +s:::g diff --git a/egs/fame/s5/path.sh b/egs/fame/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/fame/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/fame/s5/run.sh b/egs/fame/s5/run.sh new file mode 100755 index 00000000000..26a8485ff7d --- /dev/null +++ b/egs/fame/s5/run.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +stage=0 +feat_nj=10 +train_nj=10 +decode_nj=10 +famecorpus=./corpus + +if [ -d $famecorpus ] ; then + echo "Fame corpus present. OK." +elif [ -f ./fame.tar.gz ] ; then + echo "Unpacking..." + tar xzf fame.tar.gz +elif [ ! -d $famecorpus ] && [ ! -f ./fame.tar.gz ] ; then + echo "The Fame! corpus is not present. 
Please register here: http://www.ru.nl/clst/datasets/ " + echo " and download the corpus and put it at $famecorpus" && exit 1 +fi + +numLeavesTri1=5000 +numGaussTri1=25000 +numLeavesMLLT=5000 +numGaussMLLT=25000 +numLeavesSAT=5000 +numGaussSAT=25000 +numGaussUBM=800 +numLeavesSGMM=10000 +numGaussSGMM=20000 + +if [ $stage -le 1 ]; then + local/fame_data_prep.sh $famecorpus || exit 1; + local/fame_dict_prep.sh $famecorpus || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang || exit 1; + utils/format_lm.sh data/lang data/local/LM.gz data/local/dict/lexicon.txt data/lang_test || exit 1; +fi + +if [ $stage -le 2 ]; then + # Feature extraction + for x in train devel test; do + steps/make_mfcc.sh --nj $feat_nj --cmd "$train_cmd" data/$x exp/make_mfcc/$x mfcc || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc || exit 1; + done +fi + +if [ $stage -le 3 ]; then + ### Monophone + echo "Starting monophone training." + steps/train_mono.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/mono || exit 1; + echo "Mono training done." + + echo "Decoding the development and test sets using monophone models." + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/mono/graph data/devel exp/mono/decode_devel || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/mono/graph data/test exp/mono/decode_test || exit 1; + echo "Monophone decoding done." +fi + + +if [ $stage -le 4 ]; then + ### Triphone + echo "Starting triphone training." + steps/align_si.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/mono exp/mono_ali || exit 1; + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" $numLeavesTri1 $numGaussTri1 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + echo "Triphone training done." + + echo "Decoding the development and test sets using triphone models." + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri1/graph data/devel exp/tri1/decode_devel || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri1/graph data/test exp/tri1/decode_test || exit 1; + echo "Triphone decoding done." +fi + +if [ $stage -le 5 ]; then + ### Triphone + LDA and MLLT + echo "Starting LDA+MLLT training." + steps/align_si.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" $numLeavesMLLT $numGaussMLLT data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + echo "LDA+MLLT training done." + + echo "Decoding the development and test sets using LDA+MLLT models." + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri2/graph data/devel exp/tri2/decode_devel || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2/decode_test || exit 1; + echo "LDA+MLLT decoding done." +fi + + +if [ $stage -le 6 ]; then + ### Triphone + LDA and MLLT + SAT and FMLLR + echo "Starting SAT+FMLLR training." + steps/align_si.sh --nj $train_nj --cmd "$train_cmd" --use-graphs true data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + steps/train_sat.sh --cmd "$train_cmd" $numLeavesSAT $numGaussSAT data/train data/lang exp/tri2_ali exp/tri3 || exit 1; + echo "SAT+FMLLR training done." 
+ + echo "Decoding the development and test sets using SAT+FMLLR models." + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph || exit 1; + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri3/graph data/devel exp/tri3/decode_devel || exit 1; + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri3/graph data/test exp/tri3/decode_test || exit 1; + echo "SAT+FMLLR decoding done." +fi + + +if [ $stage -le 7 ]; then + echo "Starting SGMM training." + steps/align_fmllr.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/tri3 exp/tri3_ali || exit 1; + steps/train_ubm.sh --cmd "$train_cmd" $numGaussUBM data/train data/lang exp/tri3_ali exp/ubm || exit 1; + steps/train_sgmm2.sh --cmd "$train_cmd" $numLeavesSGMM $numGaussSGMM data/train data/lang exp/tri3_ali exp/ubm/final.ubm exp/sgmm2 || exit 1; + echo "SGMM training done." + + echo "Decoding the development and test sets using SGMM models" + utils/mkgraph.sh data/lang_test exp/sgmm2 exp/sgmm2/graph || exit 1; + steps/decode_sgmm2.sh --nj $decode_nj --cmd "$decode_cmd" --transform-dir exp/tri3/decode_devel exp/sgmm2/graph data/devel exp/sgmm2/decode_devel || exit 1; + steps/decode_sgmm2.sh --nj $decode_nj --cmd "$decode_cmd" --transform-dir exp/tri3/decode_test exp/sgmm2/graph data/test exp/sgmm2/decode_test || exit 1; + echo "SGMM decoding done." +fi + +if [ $stage -le 8 ]; then + echo "Starting DNN training and decoding." + local/nnet/run_dnn.sh || exit 1; + local/nnet/run_dnn_fbank.sh || exit 1; +fi + +#score +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/fame/s5/steps b/egs/fame/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/fame/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/fame/s5/utils b/egs/fame/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/fame/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file From 80295c1e5c33bcec17059152db580704c5ca1c2b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 7 Dec 2016 21:13:33 -0500 Subject: [PATCH 004/213] Early parts of 'shortcut' compilation --- src/nnet3/nnet-computation.h | 6 +-- src/nnet3/nnet-optimize-utils.cc | 45 +++++++++++++++++++ src/nnet3/nnet-optimize-utils.h | 68 ++++++++++++++++++++++++++++- src/nnet3/nnet-optimize.cc | 48 ++++++++++++++++---- src/nnet3/nnet-optimize.h | 75 ++++++++++++++++++++++++-------- 5 files changed, 211 insertions(+), 31 deletions(-) diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 0d0b13547bf..a5f8cc2aca7 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -91,9 +91,9 @@ struct IoSpecification { void Swap(IoSpecification *other); void Read(std::istream &istream, bool binary); - + void Write(std::ostream &ostream, bool binary) const; - + bool operator== (const IoSpecification &other) const; }; @@ -147,7 +147,7 @@ struct ComputationRequest { void Read(std::istream &istream, bool binary); void Write(std::ostream &ostream, bool binary) const; - + bool operator== (const ComputationRequest &other) const; }; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index df7f975db86..75e5b34bfb7 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -1829,6 +1829,23 @@ void DerivativeTimeLimiter::PruneMatrices() { LimitMatrices(will_limit); } + +int32 MaxOutputTimeInRequest(const ComputationRequest &request) { + int32 ans = 
std::numeric_limits::min(); + for (size_t i = 0; i < request.outputs.size(); i++) { + std::vector indexes &indexes = request.outputs[i].indexes; + std::vector indexes::const_iterator iter = indexes.begin(), + end = indexes.end(); + for (; iter != end; ++iter) + if (iter.t > ans) + ans = iter.t; + } + if (ans == std::numeric_limits::min()) { + KALDI_ERR << "Failed to find any output indexes in computation request."; + } + return ans; +} + void LimitDerivativeTimes(const Nnet &nnet, int32 min_deriv_time, int32 max_deriv_time, @@ -1838,5 +1855,33 @@ void LimitDerivativeTimes(const Nnet &nnet, limiter.LimitDerivTimes(); } +// This class implements the internals of the ExpandComputation() function. +class ComputationExpander { + public: + ComputationExpander(const Computation &computation, + bool need_debug_info, + int32 num_n_values, + Computation *expanded_computation): + computation_(computation), + need_debug_info_(need_debug_info), + num_n_values_(num_n_values), + expanded_computation_(expanded_computation) { } + + // This function call implements the functionality of the class, + // expanding the computation. + bool Expand(); + + private: + + const Computation &computation_; + bool need_debug_info_; + int32 num_n_values_; + Computation *expanded_computation_; + + +}; + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index d82867252ec..84697407a1e 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -522,6 +522,12 @@ class DerivativeTimeLimiter { std::vector prune_info_; }; + +// This utility function, used in code that calls LimitDerivativeTimes(), returns +// the largest time 't' in any of the 'outputs' in the computation request, +// or crashes if there are no outputs (or no cindexes in those outputs). +int32 MaxOutputTimeInRequest(const ComputationRequest &request); + // This is the top-level interface to limit the times on which derivatives are // computed (e.g. for truncated BPTT); internally it uses class // DerivativeLimiter. Will do nothing if min_deriv_time and max_deriv_time are @@ -532,6 +538,67 @@ void LimitDerivativeTimes(const Nnet &nnet, NnetComputation *computation); +/** This function, used in 'shortcut' compilation where we first compile a + smaller computation with the same structure but only 2 distinct 'n' + values, works out whether a computation is 'decomposable'; if so, + it returns true and outputs the 'mini_request' with the same structure, + and the number of 'n' values. + + A computation is decomposable if the following conditions hold: + + - All of its inputs and outputs contain 'n' values for all 0 <= n < N, + for some N > 2. [we output this 'N' as 'num_n_values']. + - All of its inputs and outputs have 'regular' structure. + + What it means for an input or output (i.e. an IoSpecification) to have a + 'regular' structure, is as follows: + - The 't' and 'x' values present are the same for each 'n', + - The order in which the indexes appear is EITHER of the following: + - The 'n' varies the most rapidly, i.e. the order is: + (t1,x1,0), (t1,x1,1) ... (t1,x1,N-1) \ + (t2,x2,0), (t2,x2,1) ... (t2,x2,N-1) ... + - The 'n' varies the least rapidly, i.e. the order is: + (t1,x1,0), (t2,x2,0) ... \ + (t1,x1,1), (t2,x2,1) ... \ + ... \ + (t1,x2,N-1), (t2,x2,N-1) ... + In either case, there does not have to be any particular rhyme or + reason to the order of the t and x values, the regularity on 'n' is + all that we care about. 
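To make the 'regular' ordering requirement concrete, here is a small, self-contained illustration (an editorial sketch, not part of this patch): it builds both admissible orderings for N = 3 and two arbitrary (t, x) pairs. The local Index struct only mimics the (n, t, x) triple of nnet3's Index type, and the values are made up for the example.

  // Sketch only: a toy stand-in for nnet3's Index, just to show the two orderings.
  #include <cstdio>
  #include <vector>

  struct Index { int n, t, x; };

  int main() {
    const int N = 3;                     // number of distinct 'n' values
    const int t_vals[2] = { 10, 11 };    // arbitrary t values for the example
    const int x_vals[2] = { 0, 0 };      // arbitrary x values for the example

    std::vector<Index> n_fastest, n_slowest;
    for (int i = 0; i < 2; i++)          // 'n' varies the most rapidly
      for (int n = 0; n < N; n++) {
        Index idx = { n, t_vals[i], x_vals[i] };
        n_fastest.push_back(idx);
      }
    for (int n = 0; n < N; n++)          // 'n' varies the least rapidly
      for (int i = 0; i < 2; i++) {
        Index idx = { n, t_vals[i], x_vals[i] };
        n_slowest.push_back(idx);
      }

    for (size_t j = 0; j < n_fastest.size(); j++)
      std::printf("(t=%d,x=%d,n=%d) ", n_fastest[j].t, n_fastest[j].x, n_fastest[j].n);
    std::printf("  <- 'n' fastest\n");
    for (size_t j = 0; j < n_slowest.size(); j++)
      std::printf("(t=%d,x=%d,n=%d) ", n_slowest[j].t, n_slowest[j].x, n_slowest[j].n);
    std::printf("  <- 'n' slowest\n");
    return 0;
  }

Any input or output whose index list follows one of these two patterns, for every n in 0..N-1, is 'regular' in the sense used here.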
+ */ +bool ComputationIsDecomposable(const ComputationRequest &request, + ComputationRequest *mini_request, + int32 *num_n_values); // TODO: implement this. + + +/** + This function is used in 'shortcut' compilation to expand a computation + that has been compiled for exactly 2 'n' values, to one that is suitable + for some num_n_values > 2. + @param [in] computation The computation that was compiled for exactly + 2 'n' values (n=0 and n=1) + @param [in] need_debug_info True if we want to retain the 'debug_info' + in the output 'expanded_computation'. In any + case, the 'debug_info' is required in the + input computation. + @param [in] num_n_values The number of 'n' values we want in the output + computation + @param [out] expanded_computation The expanded computation. + + @return This function returns true if it succeeded, and false if it + could not expand the computation for some reason (e.g. there + was some non-simple component where the 'PrecomputedIndexes' + object could not be suitably expanded. If it returns false, + the output 'expanded_computation' is undefined (may contain junk). + */ +bool ExpandComputation(const Computation &computation, + bool need_debug_info, + int32 num_n_values, + Computation *expanded_computation); + + + + /// This function detects submatrices, matrices, and members of indexes_multi /// and indexes that are never used (e.g. due to changes made in other /// optimization code), and removes them from the computation by way of suitable @@ -655,4 +722,3 @@ void IdentifyIndexesRangesArgs(std::vector *commands, #endif - diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 08a28e22025..9d6ff739768 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -52,7 +52,15 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &min_deriv_time); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &max_deriv_time); - ExpectToken(is, binary, ""); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &max_deriv_time_relative); + ReadToken(is, binary, &tok); + } + + + KALDI_ASSERT(tok == ""); } void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const { @@ -83,6 +91,8 @@ void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, min_deriv_time); WriteToken(os, binary, ""); WriteBasicType(os, binary, max_deriv_time); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, max_deriv_time_relative); WriteToken(os, binary, ""); } @@ -99,7 +109,8 @@ bool NnetOptimizeOptions::operator == (const NnetOptimizeOptions &other) const { other.move_sizing_commands == move_sizing_commands && other.allocate_from_other == allocate_from_other && other.min_deriv_time == min_deriv_time && - other.max_deriv_time == max_deriv_time); + other.max_deriv_time == max_deriv_time && + other.max_deriv_time_relative == max_deriv_time_relative); } // move commands that resize matrices to as late/early as possible. @@ -413,10 +424,16 @@ void Optimize(const NnetOptimizeOptions &config, if (GetVerboseLevel() >= 4) CheckComputation(nnet, request, *computation, true); - // this will do nothing unless --min-deriv-time or --max-deriv-time was - // set. - LimitDerivativeTimes(nnet, config.min_deriv_time, config.max_deriv_time, - computation); + { // Call LimitDerivativeTimes(). + // this will do nothing unless --min-deriv-time or --max-deriv-time + // or --max-deriv-time-relative was set. 
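Taken together, ComputationIsDecomposable() and ExpandComputation() above suggest the following shortcut-compilation flow. This is a schematic sketch based only on the doc comments, not code from the patch; it assumes the usual kaldi::nnet3 headers and namespace, and the compilation step in the middle is elided because the Compiler plumbing is outside this hunk.

  // Schematic only; assumes kaldi::nnet3 and an existing ComputationRequest 'request'.
  ComputationRequest mini_request;
  int32 num_n_values = 0;
  if (ComputationIsDecomposable(request, &mini_request, &num_n_values)) {
    // 'mini_request' has the same structure as 'request', but only 2 'n' values.
    Computation mini_computation;
    // ... compile 'mini_request' into 'mini_computation' in the usual way ...
    Computation expanded_computation;
    if (ExpandComputation(mini_computation, /*need_debug_info=*/true,
                          num_n_values, &expanded_computation)) {
      // 'expanded_computation' should now be usable wherever compiling
      // 'request' directly would have been.
    }
  }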
+ int32 max_deriv_time = config.max_deriv_time; + if (config.max_deriv_time_relative != std::numeric_limits::max()) + max_deriv_time = config.max_deriv_time_relative + + MaxOutputTimeInRequest(request); + LimitDerivativeTimes(nnet, config.min_deriv_time, + max_deriv_time, computation); + } if (GetVerboseLevel() >= 4) CheckComputation(nnet, request, *computation, true); @@ -478,11 +495,26 @@ size_t ComputationRequestHasher::operator() (const ComputationRequest *cr) const size_t ComputationRequestHasher::IoSpecificationToInt(const IoSpecification& spec) const { size_t ans; + size_t n = 19; // this value is used to extract only a subset of elements to hash; + // it makes the hasher faster. StringHasher string_hasher; ans = string_hasher(spec.name); std::vector::const_iterator itr = spec.indexes.begin(), - end = spec.indexes.end(); - for (; itr != end; ++itr) { + end = spec.indexes.end(), + med = end; + if (med > itr + n) + med = iter + n; + + for (; itr != med; ++itr) { + ans += (*itr).n * 1619; + ans += (*itr).t * 15649; + ans += (*itr).x * 89809; + } + // after the first 'n' values, look only at every n'th value. this makes the + // hashing much faster, and in the kinds of structures that we actually deal + // with, we shouldn't get unnecessary hash collisions as a result of this + // optimization. + for (; iter < end; itr += n) { ans += (*itr).n * 1619; ans += (*itr).t * 15649; ans += (*itr).x * 89809; diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index e04aff302c9..732f11e29ac 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -29,7 +29,7 @@ namespace kaldi { namespace nnet3 { -// Options class for optimizing a NnetComputation The main projected use for +// Options class for optimizing a NnetComputation. The main projected use for // this is in debugging the optimization code itself, so that if an error is // detected, we can work out which optimization was responsible for the error. struct NnetOptimizeOptions { @@ -46,20 +46,23 @@ struct NnetOptimizeOptions { bool allocate_from_other; int32 min_deriv_time; int32 max_deriv_time; - - NnetOptimizeOptions(): optimize(true), - consolidate_model_update(true), - propagate_in_place(true), - backprop_in_place(true), - convert_addition(true), - remove_assignments(true), - allow_left_merge(true), - allow_right_merge(true), - initialize_undefined(true), - move_sizing_commands(true), - allocate_from_other(true), - min_deriv_time(std::numeric_limits::min()), - max_deriv_time(std::numeric_limits::max()) { } + int32 max_deriv_time_relative; + + NnetOptimizeOptions(): + optimize(true), + consolidate_model_update(true), + propagate_in_place(true), + backprop_in_place(true), + convert_addition(true), + remove_assignments(true), + allow_left_merge(true), + allow_right_merge(true), + initialize_undefined(true), + move_sizing_commands(true), + allocate_from_other(true), + min_deriv_time(std::numeric_limits::min()), + max_deriv_time(std::numeric_limits::max()), + max_deriv_time_relative(std::numeric_limits::max()) {} void Register(OptionsItf *opts) { opts->Register("optimize", &optimize, "Set this to false to turn off all " @@ -99,6 +102,12 @@ struct NnetOptimizeOptions { "the maximum t value that you want derivatives to be computed " "at when updating the model. 
This is an optimization that " "saves time in the backprop phase for recurrent frameworks"); + opts->Register("max-deriv-time-relative", &max_deriv_time_relative, + "An alternative mechanism for setting the --max-deriv-time, " + "suitable for situations where the length of the egs is " + "variable. If set, it is equivalent to setting the " + "--max-deriv-time to this value plus the largest 't' value " + "in any 'output' node of the computation request."); } void Read(std::istream &is, bool binary); void Write(std::ostream &os, bool binary) const; @@ -130,20 +139,47 @@ struct ComputationRequestPtrEqual { } }; + + +struct CachingOptimizingCompilerOptions { + bool use_shortcut; + int32 write_cache; + int32 cache_capacity; + + + + CachingOptimizingCompilerOptions(): + use_shortcut(true), + cache_capacity(64) { } + + void Register(OptionsItf *opts) { + opts->Register("use-shortcut", &use_shortcut, + "If true, use the 'shortcut' in compilation whereby " + "computation requests with regular structure are identified " + "as such, a computation with a smaller number of distinct " + "values of 'n' is compiled (e.g. 2), and the compiled " + "computation is expanded to match the size of the real " + "computation request."); + opts->Register("cache-capacity", &cache_capacity, + "Determines how many computations the computation-cache will " + "store (most-recently-used)."); + } +}; + /// This class enables you to do the compilation and optimization in one call, /// and also ensures that if the ComputationRequest is identical to the previous /// one, the compilation process is not repeated. class CachingOptimizingCompiler { public: CachingOptimizingCompiler(const Nnet &nnet, - const int32 capacity = 20): - nnet_(nnet), cache_capacity_(capacity) { } + const CachingOptimizingCompilerOptions &config): + nnet_(nnet), config_(config), cache_capacity_(capacity) { } /// Note: nnet is retained as a const reference but opt_config is copied. CachingOptimizingCompiler(const Nnet &nnet, const NnetOptimizeOptions &opt_config, - const int32 capacity = 20): - nnet_(nnet), opt_config_(opt_config), cache_capacity_(capacity) { } + const CachingOptimizingCompilerOptions &config): + nnet_(nnet), config_(config), opt_config_(opt_config) { } ~CachingOptimizingCompiler(); /// Does the compilation and returns a const pointer to @@ -155,6 +191,7 @@ class CachingOptimizingCompiler { void WriteCache(std::ostream &os, bool binary) const; private: const Nnet &nnet_; + CachingOptimizingCompilerOptions config_; NnetOptimizeOptions opt_config_; // The access queue for keeping track of the freshness of computation. From f16da00d85507c5a301e72fb0f80110299cff183 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Tue, 4 Oct 2016 20:12:46 -0400 Subject: [PATCH 005/213] Cosmetic changes in nnet3 code --- src/nnet3/nnet-compile.cc | 2 +- src/nnet3/nnet-compile.h | 1 - src/nnet3/nnet-computation.cc | 2 -- src/nnet3/nnet-compute.h | 5 ++--- src/nnet3/nnet-optimize-utils.h | 5 ++++- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index 42ca5d7a83e..7e30b6c7004 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -132,7 +132,7 @@ void Compiler::ComputeDerivNeeded( unordered_set::iterator iter = input_steps.begin(), end = input_steps.end(); - // if some step that we depends on needs a derivative, we need the derivative. + // if some step that we depend on needs a derivative, we need the derivative. 
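Returning briefly to the hashing change in IoSpecificationToInt() above: the intent is to keep hashing cheap for very long index lists by hashing the first 19 indexes exactly and then only every 19th index after that. The loop there mixes the iterator names 'iter' and 'itr'; the index-based restatement below avoids that. It is an editorial sketch, not the patch code, and the struct and function name are made up.

  // Sketch only (not the patch code): subset hashing over a toy Index type.
  #include <cstddef>
  #include <vector>

  struct Index { int n, t, x; };   // stand-in for nnet3's Index

  size_t HashIndexesSubset(const std::vector<Index> &indexes) {
    const size_t stride = 19;      // same constant the patch uses for 'n'
    size_t ans = 0;
    size_t size = indexes.size(),
        head = (size < stride ? size : stride);
    for (size_t i = 0; i < head; i++) {             // first 'stride' indexes: hash all
      ans += indexes[i].n * 1619;
      ans += indexes[i].t * 15649;
      ans += indexes[i].x * 89809;
    }
    for (size_t i = head; i < size; i += stride) {  // afterwards: every stride'th index
      ans += indexes[i].n * 1619;
      ans += indexes[i].t * 15649;
      ans += indexes[i].x * 89809;
    }
    return ans;
  }

Because the index structures being hashed are highly regular, skipping most of the elements rarely introduces extra collisions, which is the justification given in the comment above.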
for (; iter != end; ++iter) { int32 dep_step = *iter; KALDI_ASSERT(dep_step < step); diff --git a/src/nnet3/nnet-compile.h b/src/nnet3/nnet-compile.h index 2d187bb6876..4dda38ae723 100644 --- a/src/nnet3/nnet-compile.h +++ b/src/nnet3/nnet-compile.h @@ -100,7 +100,6 @@ class Compiler { // this sets up cindex_id_to_location_. void CreateLocationInfo(const std::vector > &by_step); - // Computes the set of step-indexes of preceding steps that this step depends // on. Assumes CreateLocationInfo() has already been called. Requires // 'step_index' only to handle a special case, that if 'this_step' is a diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index ba56f5080e8..bb6cf0dd68e 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -1,7 +1,5 @@ // nnet3/nnet-computation.cc -// nnet3/nnet-computation.cc - // Copyright 2015 Johns Hopkins University (author: Daniel Povey) // 2015 Xiaohui Zhang diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index abf7a0df12c..d1c28e8bd7c 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -77,9 +77,8 @@ class NnetComputer { CuMatrix *input); /// This function calls AcceptInput() in turn on all the inputs in the - /// training example (provide example.io; this interface makes it easy to work - /// with CCTC examples too). It needs "nnet" only in order to distinguish - /// inputs from outputs. + /// training example. It needs "nnet" only in order to distinguish inputs + /// from outputs. void AcceptInputs(const Nnet &nnet, const std::vector &io); diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 84697407a1e..641e31d96b3 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -139,7 +139,10 @@ struct NnetOptimizeOptions; // Forward declaration. automatically detect that there are duplicate submatrices, and will merge them, as well as removing the now-unused matrix indexes. After merging, we will mark the variables (i.e. row-ranges) underlying s1 and s2 as being - "dirty" so they can no longer be merged during the lifetime of this class. + "dirty" so they can no longer be merged during the lifetime of this class-- + this is so we don't have to think to hard; we apply this optimization + multiple times until it makes no change (see + nnet-optimize.cc:VariableMerginOptimization()). */ class VariableMergingOptimizer { public: From 06bd75a4ff3a0e8f70c2b836c848eca626073a99 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Sat, 8 Oct 2016 01:20:33 -0400 Subject: [PATCH 006/213] Some code refactoring that will make it easier to implement online recognition in nnet3. Not fully debugged. 
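A rough sketch of the calling convention this refactoring moves towards, based on the call-site changes in the hunks below: the separate forward and backward entry points become repeated Run() calls, and output derivatives are passed in through AcceptInput() just like ordinary inputs. This is an editorial illustration, not code from the patch; it assumes kaldi::nnet3, and 'compute_opts', 'computation', 'nnet', 'deriv_nnet', 'eg' and 'output_deriv' are placeholders for objects the caller already has.

  // Editorial sketch of the new NnetComputer usage; names other than the
  // member functions are placeholders, and argument details vary by call site.
  NnetComputer computer(compute_opts, computation, nnet, &deriv_nnet);
  computer.AcceptInputs(nnet, eg.io);               // queue all input matrices
  computer.Run();                                   // forward phase
  CuMatrix<BaseFloat> output;
  computer.GetOutputDestructive("output", &output);
  // ... compute the objective and its derivative w.r.t. 'output' ...
  computer.AcceptInput("output", &output_deriv);    // derivative goes in like an input
  computer.Run();                                   // backward phase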
--- src/nnet3/nnet-am-decodable-simple.cc | 2 +- src/nnet3/nnet-analyze.cc | 230 ++++++------ src/nnet3/nnet-analyze.h | 31 +- src/nnet3/nnet-chain-diagnostics.cc | 12 +- src/nnet3/nnet-chain-training.cc | 8 +- src/nnet3/nnet-compile.cc | 162 ++++++--- src/nnet3/nnet-compile.h | 39 +- src/nnet3/nnet-computation.cc | 127 ++++--- src/nnet3/nnet-computation.h | 59 +++- src/nnet3/nnet-compute-test.cc | 23 +- src/nnet3/nnet-compute.cc | 265 ++++++-------- src/nnet3/nnet-compute.h | 87 +++-- src/nnet3/nnet-derivative-test.cc | 14 +- src/nnet3/nnet-diagnostics.cc | 4 +- src/nnet3/nnet-discriminative-diagnostics.cc | 28 +- src/nnet3/nnet-discriminative-training.cc | 26 +- src/nnet3/nnet-optimize-test.cc | 16 +- src/nnet3/nnet-optimize-utils.cc | 353 +++++-------------- src/nnet3/nnet-optimize-utils.h | 120 ++----- src/nnet3/nnet-optimize.cc | 93 ++++- src/nnet3/nnet-optimize.h | 9 + src/nnet3/nnet-training.cc | 12 +- src/nnet3/online-nnet3-decodable-simple.cc | 2 +- 23 files changed, 827 insertions(+), 895 deletions(-) diff --git a/src/nnet3/nnet-am-decodable-simple.cc b/src/nnet3/nnet-am-decodable-simple.cc index bc851790a05..9116c9461ac 100644 --- a/src/nnet3/nnet-am-decodable-simple.cc +++ b/src/nnet3/nnet-am-decodable-simple.cc @@ -261,7 +261,7 @@ void DecodableNnetSimple::DoNnetComputation( ivector_feats_cu.Row(0).CopyFromVec(ivector); computer.AcceptInput("ivector", &ivector_feats_cu); } - computer.Forward(); + computer.Run(); CuMatrix cu_output; computer.GetOutputDestructive("output", &cu_output); // subtract log-prior (divide by prior) diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 3f04732848c..2176837a7d9 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -278,8 +278,7 @@ void ComputeCommandAttributes( switch (c.command_type) { case kAllocMatrixZeroed: case kAllocMatrixFromOtherZeroed: - vars.AppendVariablesForMatrix(c.arg1, &attr.variables_written); - attr.matrices_written.push_back(c.arg1); + vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr); break; case kAllocMatrixUndefined: // nothing is written here. case kDeallocMatrix: // ditto. @@ -372,6 +371,14 @@ void ComputeCommandAttributes( vars.RecordAccessForSubmatrix(c.arg2, kReadAccess, &attr); break; } + case kAcceptInput: { + vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr); + break; + } + case kProvideOutput: { + vars.RecordAccessForSubmatrix(c.arg1, kReadAccess, &attr); + break; + } case kNoOperation: case kNoOperationMarker: break; @@ -478,68 +485,65 @@ void ComputeMatrixAccesses( Access(c, kWriteAccess)); } } - // Now set up allocate_command and deallocate_command. + // Now set up allocate_command, deallocate_command, + // is_input and is_output. 
const NnetComputation::Command &command = computation.commands[c]; - int32 matrix_index = command.arg1, - matrix_index2 = command.arg2; + int32 matrix_index1, matrix_index2; + switch (command.command_type) { case kAllocMatrixZeroed: case kAllocMatrixUndefined: - if ((*matrix_accesses)[matrix_index].allocate_command != -1) - KALDI_ERR << "Matrix " << matrix_index << " initialized twice."; - (*matrix_accesses)[matrix_index].allocate_command = c; + if (!computation.IsWholeMatrix(command.arg1)) + KALDI_ERR << "Command does not operate on whole matrix"; + matrix_index1 = computation.submatrices[command.arg1].matrix_index; + if ((*matrix_accesses)[matrix_index1].allocate_command != -1) + KALDI_ERR << "Matrix " << matrix_index1 << " initialized twice."; + (*matrix_accesses)[matrix_index1].allocate_command = c; break; case kAllocMatrixFromOther: case kAllocMatrixFromOtherZeroed: - if ((*matrix_accesses)[matrix_index].allocate_command != -1) - KALDI_ERR << "Matrix " << matrix_index << " initialized twice."; - (*matrix_accesses)[matrix_index].allocate_command = c; + if (!computation.IsWholeMatrix(command.arg1)) + KALDI_ERR << "Command does not operate on whole matrix"; + matrix_index1 = computation.submatrices[command.arg1].matrix_index; + KALDI_ASSERT(computation.IsWholeMatrix(command.arg2)); + matrix_index2 = computation.submatrices[command.arg2].matrix_index; + if ((*matrix_accesses)[matrix_index1].allocate_command != -1) + KALDI_ERR << "Matrix " << matrix_index1 << " initialized twice."; + (*matrix_accesses)[matrix_index1].allocate_command = c; if ((*matrix_accesses)[matrix_index2].deallocate_command != -1) - KALDI_ERR << "Matrix " << matrix_index << " destroyed twice."; + KALDI_ERR << "Matrix " << matrix_index2 << " destroyed twice."; (*matrix_accesses)[matrix_index2].deallocate_command = c; break; case kDeallocMatrix: - if ((*matrix_accesses)[matrix_index].deallocate_command != -1) - KALDI_ERR << "Matrix " << matrix_index << " destroyed twice."; - (*matrix_accesses)[matrix_index].deallocate_command = c; + if (!computation.IsWholeMatrix(command.arg1)) + KALDI_ERR << "Command does not operate on whole matrix"; + matrix_index1 = computation.submatrices[command.arg1].matrix_index; + if ((*matrix_accesses)[matrix_index1].deallocate_command != -1) + KALDI_ERR << "Matrix " << matrix_index1 << " destroyed twice."; + (*matrix_accesses)[matrix_index1].deallocate_command = c; + break; + case kAcceptInput: + if (!computation.IsWholeMatrix(command.arg1)) + KALDI_ERR << "Command does not operate on whole matrix"; + matrix_index1 = computation.submatrices[command.arg1].matrix_index; + (*matrix_accesses)[matrix_index1].is_input = true; + // If a certain matrix is accepted as input multiple times, we + // count the first one as allocating it (the second will just + // allocate it again, which is harmless). + if ((*matrix_accesses)[matrix_index1].allocate_command == -1) + (*matrix_accesses)[matrix_index1].allocate_command = c; + break; + case kProvideOutput: + if (!computation.IsWholeMatrix(command.arg1)) + KALDI_ERR << "Command does not operate on whole matrix"; + matrix_index1 = computation.submatrices[command.arg1].matrix_index; + (*matrix_accesses)[matrix_index1].is_output = true; break; default: ; } } - // now set up the is_input and is_output fields. 
- unordered_map >::const_iterator - iter = computation.input_output_info.begin(), - end = computation.input_output_info.end(); - for (; iter != end; ++iter) { - int32 node_index = iter->first, - value_matrix_index = iter->second.first, - deriv_matrix_index = iter->second.second; - KALDI_ASSERT(value_matrix_index > 0 && value_matrix_index < num_matrices); - if (nnet.IsInputNode(node_index)) { - // the assert checks for repeats - KALDI_ASSERT(!(*matrix_accesses)[value_matrix_index].is_input); - (*matrix_accesses)[value_matrix_index].is_input = true; - if (deriv_matrix_index != 0) { - // the derivatives, if requested, would be outputs of the computation, - // even though the node is an input node. - KALDI_ASSERT(!(*matrix_accesses)[deriv_matrix_index].is_output); - (*matrix_accesses)[deriv_matrix_index].is_output = true; - } - } else { - KALDI_ASSERT(nnet.IsOutputNode(node_index)); - // the assert checks for repeats - KALDI_ASSERT(!(*matrix_accesses)[value_matrix_index].is_output); - (*matrix_accesses)[value_matrix_index].is_output = true; - if (deriv_matrix_index != 0) { - // the derivatives, if provided, would be inputs to the computation, - // even though the node is an output node. - KALDI_ASSERT(!(*matrix_accesses)[deriv_matrix_index].is_input); - (*matrix_accesses)[deriv_matrix_index].is_input = true; - } - } - } } @@ -575,8 +579,7 @@ void ComputationChecker::CheckComputationRewrite() const { int32 num_variables = a_.variable_accesses.size(); for (int32 v = 0; v < num_variables; v++) { const std::vector &accesses = a_.variable_accesses[v]; - int32 matrix_index = a_.variables.GetMatrixForVariable(v); - if (accesses.empty() && ! a_.matrix_accesses[matrix_index].is_input) { + if (accesses.empty()) { KALDI_ERR << "Variable " << v << " = " << a_.variables.DescribeVariable(v) << "is never used."; } @@ -610,17 +613,13 @@ void ComputationChecker::CheckComputationUndefined() const { int32 num_variables = a_.variable_accesses.size(); for (int32 v = 0; v < num_variables; v++) { const std::vector &accesses = a_.variable_accesses[v]; - int32 matrix_index = a_.variables.GetMatrixForVariable(v); - bool is_input = a_.matrix_accesses[matrix_index].is_input; - if (! 
is_input) { - if (accesses.empty()) - KALDI_ERR << "Variable " << v << " == " - << a_.variables.DescribeVariable(v) << "is never used."; - if (accesses[0].access_type != kWriteAccess) - KALDI_ERR << "Variable " << v << " == " - << a_.variables.DescribeVariable(v) - << "is read before it is written to"; - } + if (accesses.empty()) + KALDI_ERR << "Variable " << v << " == " + << a_.variables.DescribeVariable(v) << "is never used."; + if (accesses[0].access_type != kWriteAccess) + KALDI_ERR << "Variable " << v << " == " + << a_.variables.DescribeVariable(v) + << " is read before it is written to"; } } @@ -637,45 +636,35 @@ void ComputationChecker::CheckComputationMatrixAccesses() const { for (int32 matrix_index = 1; matrix_index < num_matrices; matrix_index++) { const MatrixAccesses &accesses = a_.matrix_accesses[matrix_index]; - if (accesses.is_input) { - if (accesses.allocate_command != -1) - KALDI_ERR << "Input matrix is initialized."; - } else { - if (accesses.allocate_command == -1) - KALDI_ERR << "Matrix m" << matrix_index << "is not initialized."; - if (accesses.accesses.empty()) { - KALDI_ERR << "Matrix m" << matrix_index << " is never accessed."; - } else if (accesses.accesses.front().command_index < - accesses.allocate_command) { - KALDI_ERR << "Matrix m" << matrix_index << " is accessed before " - "it is initialized"; - } + if (accesses.allocate_command == -1) + KALDI_ERR << "Matrix m" << matrix_index << "is not initialized."; + if (accesses.accesses.empty()) { + KALDI_ERR << "Matrix m" << matrix_index << " is never accessed."; + } else if (accesses.accesses.front().command_index < + accesses.allocate_command) { + KALDI_ERR << "Matrix m" << matrix_index << " is accessed before " + "it is initialized"; } - if (accesses.is_output) { - if (accesses.deallocate_command != -1) - KALDI_ERR << "Output matrix is destroyed."; - } else { - if (accesses.deallocate_command == -1) - KALDI_ERR << "Matrix m" << matrix_index << " is not destroyed."; - if (accesses.accesses.empty()) { - if (accesses.is_input) { - // we allow there to be no accesses if it is an input, e.g. if an - // output derivative is supplied for some reason but never used. - // We'll warn, though (once). - if (!computation_checker_warned_unused_input) { - KALDI_WARN << "Matrix m" << matrix_index << " is never accessed. " - "Allowing because it is an input (un-needed input or " - "derivative?) Will warn only once."; - computation_checker_warned_unused_input = true; - } - } else { - KALDI_ERR << "Matrix m" << matrix_index << " is never accessed."; + + if (accesses.accesses.empty()) { + if (accesses.is_input) { + // we allow there to be no accesses if it is an input, e.g. if an + // output derivative is supplied for some reason but never used. + // We'll warn, though (once). + if (!computation_checker_warned_unused_input) { + KALDI_WARN << "Matrix m" << matrix_index << " is never accessed. " + "Allowing because it is an input (un-needed input or " + "derivative?) 
Will warn only once."; + computation_checker_warned_unused_input = true; } - } else if (accesses.accesses.back().command_index >= - accesses.deallocate_command) { - KALDI_ERR << "Matrix m" << matrix_index << " is accessed after " - "it is destroyed"; + } else { + KALDI_ERR << "Matrix m" << matrix_index << " is never accessed."; } + } else if (accesses.deallocate_command != -1 && + accesses.accesses.back().command_index >= + accesses.deallocate_command) { + KALDI_ERR << "Matrix m" << matrix_index << " is accessed after " + "it is destroyed"; } } } @@ -687,7 +676,6 @@ void ComputationChecker::CheckComputationMatrixAccesses() const { */ void ComputationChecker::CheckComputationIndexes() const { int32 num_commands = computation_.commands.size(), - num_matrices = computation_.matrices.size(), num_submatrices = computation_.submatrices.size(); const std::vector &submatrices = computation_.submatrices; @@ -698,18 +686,21 @@ void ComputationChecker::CheckComputationIndexes() const { case kAllocMatrixZeroed: case kAllocMatrixUndefined: case kDeallocMatrix: - if (c.arg1 < 1 || c.arg1 >= num_matrices) - KALDI_ERR << "matrix index out of range."; + if (c.arg1 < 1 || c.arg1 >= num_submatrices || + !computation_.IsWholeMatrix(c.arg1)) + KALDI_ERR << "submatrix index out of range or invalid"; break; case kAllocMatrixFromOther: case kAllocMatrixFromOtherZeroed: - if (c.arg1 < 1 || c.arg1 >= num_matrices || - c.arg2 < 1 || c.arg2 >= num_matrices) - KALDI_ERR << "matrix index out of range."; - if (computation_.matrices[c.arg1].num_rows != - computation_.matrices[c.arg2].num_rows || - computation_.matrices[c.arg1].num_cols != - computation_.matrices[c.arg2].num_cols) + if (c.arg1 < 1 || c.arg1 >= num_submatrices || + !computation_.IsWholeMatrix(c.arg1) || + c.arg2 < 1 || c.arg2 >= num_submatrices || + !computation_.IsWholeMatrix(c.arg2)) + KALDI_ERR << "submatrix index out of range or invalid"; + if (computation_.submatrices[c.arg1].num_rows != + computation_.submatrices[c.arg2].num_rows || + computation_.submatrices[c.arg1].num_cols != + computation_.submatrices[c.arg2].num_cols) KALDI_ERR << "Dimension mismatch in kAllocMatrixFromOther* command"; break; case kPropagate: { @@ -914,6 +905,16 @@ void ComputationChecker::CheckComputationIndexes() const { } break; } + case kAcceptInput: case kProvideOutput: { + if (c.arg1 < 1 || c.arg1 >= num_submatrices || + !computation_.IsWholeMatrix(c.arg1)) + KALDI_ERR << "submatrix index out of range or invalid"; + // note: we may later change the following condition to allow component + // nodes. we allow it on output node because of derivatives. 
+ if (!nnet_.IsInputNode(c.arg2) && !nnet_.IsOutputNode(c.arg2)) + KALDI_ERR << "Invalid network node"; + break; + } case kNoOperation: case kNoOperationMarker: break; @@ -1008,9 +1009,6 @@ void ComputeMatrixToSubmatrix( int32 ComputationAnalysis::FirstAccess(int32 s) const { KALDI_ASSERT(static_cast(s) < computation_.submatrices.size() && s>0); - int32 matrix_index = computation_.submatrices[s].matrix_index; - if (analyzer_.matrix_accesses[matrix_index].is_input) - return -1; int32 ans = computation_.commands.size(); std::vector variable_indexes; analyzer_.variables.AppendVariablesForSubmatrix(s, &variable_indexes); @@ -1042,8 +1040,6 @@ int32 ComputationAnalysis::FirstAccess(int32 s) const { int32 ComputationAnalysis::FirstMatrixAccess(int32 m) const { KALDI_ASSERT(static_cast(m) < computation_.matrices.size() && m > 0); - if (analyzer_.matrix_accesses[m].is_input) - return -1; int32 ans = computation_.commands.size(); const std::vector &accesses = analyzer_.matrix_accesses[m].accesses; @@ -1051,7 +1047,12 @@ int32 ComputationAnalysis::FirstMatrixAccess(int32 m) const { access_end = accesses.end(); for (; access_iter != access_end; ++access_iter) { int32 command_index = access_iter->command_index; - if (command_index != analyzer_.matrix_accesses[m].allocate_command) { + CommandType command_type = + computation_.commands[command_index].command_type; + if (command_type != kAllocMatrixUndefined && + command_type != kAllocMatrixZeroed && + command_type != kAllocMatrixFromOther && + command_type != kAllocMatrixFromOtherZeroed) { ans = std::min(ans, command_index); break; // break from access_iter loop (an optimization) } @@ -1062,8 +1063,6 @@ int32 ComputationAnalysis::FirstMatrixAccess(int32 m) const { int32 ComputationAnalysis::LastMatrixAccess(int32 m) const { KALDI_ASSERT(static_cast(m) < computation_.matrices.size() && m > 0); - if (analyzer_.matrix_accesses[m].is_output) - return computation_.commands.size(); int32 ans = -1; const std::vector &accesses = analyzer_.matrix_accesses[m].accesses; @@ -1080,9 +1079,6 @@ int32 ComputationAnalysis::LastMatrixAccess(int32 m) const { int32 ComputationAnalysis::LastAccess(int32 s) const { KALDI_ASSERT(static_cast(s) < computation_.submatrices.size() && s>0); - int32 matrix_index = computation_.submatrices[s].matrix_index; - if (analyzer_.matrix_accesses[matrix_index].is_output) - return computation_.commands.size(); int32 ans = -1; std::vector variable_indexes; analyzer_.variables.AppendVariablesForSubmatrix(s, &variable_indexes); diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 28a62e996b8..8b02d6376e9 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -145,6 +145,7 @@ class ComputationVariables { int32 matrix_index, std::vector *variable_indexes) const; + // Appends to variable_indexes the sorted list of variables corresponding to a // submatrix index. void AppendVariablesForSubmatrix( @@ -311,23 +312,20 @@ class ComputationAnalysis { const Analyzer &analyzer): computation_(computation), analyzer_(analyzer) { } - /// If the matrix underlying submatrix 's' is an input then this returns -1; - /// otherwise it returns the first command (read or write) that is not an - /// allocation command, that accesses any part of 's' [note: deallocation does - /// not count as a read or write operation]. If there is no such command, it - /// returns num_commands. 
+ /// Returns the first command (read or write) that is not a kAlloc* command, + /// that accesses any part of 's' [note: deallocation does not count as a read + /// or write operation]. If there is no such command, it returns + /// num_commands. /// s must be >0 (i.e. not the empty submatrix). int32 FirstAccess(int32 s) const; - /// If the matrix underlying submatrix 's' is an output then this returns - /// num-commands; otherwise it returns the last non-deallocation command - /// that accesses any part of submatrix 's'; if there is no such command it - /// returns -1. + /// Returns the last non-deallocation command that accesses any part of + /// submatrix 's'; if there is no such command it returns -1. /// s must be >0 (i.e. not the empty submatrix). int32 LastAccess(int32 s) const; /// Returns the last command-index that accesses any part of submatrix 's' as - /// a write operation, or -1 if there is no such operation. Not: deallocation + /// a write operation, or -1 if there is no such operation. Note: deallocation /// does not count as a write operation. /// s must be >0 (i.e. not the empty submatrix). int32 LastWriteAccess(int32 s) const; @@ -339,16 +337,13 @@ class ComputationAnalysis { /// s must be >0 (i.e. not the empty submatrix). int32 DataInvalidatedCommand(int32 c, int32 s) const; - /// If matrix 'm' is an input then this returns -1; otherwise it returns the - /// first command (read or write) that is not an allocation command, that - /// accesses any part of 'm' [note: deallocation does not count as a read or - /// write operation]. If there is no such command, it returns num_commands. - /// m must be >0 (i.e. not the empty matrix). + /// Returns the first command (read or write or accept-input) that is not an + /// kAllocate* command, that accesses any part of 'm' [note: deallocation does + /// not count as a read or write operation]. If there is no such command, it + /// returns num_commands. m must be >0 (i.e. not the empty matrix). int32 FirstMatrixAccess(int32 m) const; - - /// If matrix 'm' is an output then this returns num-commands; otherwise it - /// returns the last non-deallocation command that accesses any part of + /// Returns the last non-deallocation command that accesses any part of /// matrix 'm'; if there is no such command it returns -1. m must be >0 /// (i.e. not the empty matrix). int32 LastMatrixAccess(int32 m) const; diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 46e2b0c01dc..b6b39816337 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -82,10 +82,10 @@ void NnetChainComputeProb::Compute(const NnetChainExample &chain_eg) { nnet_, deriv_nnet_); // give the inputs to the computer object. computer.AcceptInputs(nnet_, chain_eg.inputs); - computer.Forward(); + computer.Run(); this->ProcessOutputs(chain_eg, &computer); if (nnet_config_.compute_deriv) - computer.Backward(); + computer.Run(); } void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, @@ -111,15 +111,15 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, if (use_xent) xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - + BaseFloat tot_like, tot_l2_term, tot_weight; - + ComputeChainObjfAndDeriv(chain_config_, den_graph_, sup.supervision, nnet_output, &tot_like, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? &nnet_output_deriv : NULL), (use_xent ? 
&xent_deriv : NULL)); - + // note: in this context we don't want to apply 'sup.deriv_weights' because // this code is used only in combination, where it's part of an L-BFGS // optimization algorithm, and in that case if there is a mismatch between @@ -134,7 +134,7 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, totals.tot_l2_term += tot_l2_term; if (nnet_config_.compute_deriv) - computer->AcceptOutputDeriv(sup.name, &nnet_output_deriv); + computer->AcceptInput(sup.name, &nnet_output_deriv); if (use_xent) { ChainObjectiveInfo &xent_totals = objf_info_[xent_name]; diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index d9d43006601..bfc67db17be 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -73,10 +73,10 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { *nnet_, delta_nnet_); // give the inputs to the computer object. computer.AcceptInputs(*nnet_, chain_eg.inputs); - computer.Forward(); + computer.Run(); this->ProcessOutputs(chain_eg, &computer); - computer.Backward(); + computer.Run(); UpdateParamsWithMaxChange(); } @@ -134,7 +134,7 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg, xent_deriv.MulRowsVec(cu_deriv_weights); } - computer->AcceptOutputDeriv(sup.name, &nnet_output_deriv); + computer->AcceptInput(sup.name, &nnet_output_deriv); objf_info_[sup.name].UpdateStats(sup.name, opts_.nnet_config.print_interval, num_minibatches_processed_++, @@ -142,7 +142,7 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg, if (use_xent) { xent_deriv.Scale(opts_.chain_config.xent_regularize); - computer->AcceptOutputDeriv(xent_name, &xent_deriv); + computer->AcceptInput(xent_name, &xent_deriv); } } } diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index 7e30b6c7004..ae3073b6265 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -21,6 +21,7 @@ #include #include "nnet3/nnet-compile.h" #include "nnet3/nnet-compile-utils.h" +#include "nnet3/nnet-optimize.h" // just for ConsolidateIoOperations(). namespace kaldi { namespace nnet3 { @@ -51,29 +52,68 @@ void Compiler::CreateComputation(const CompilerOptions &opts, ComputeDerivNeeded(steps, &deriv_needed); CreateStepInfo(deriv_needed, &steps, computation); AddCommands(deriv_needed, computation); + // the following command reorders commands so kAcceptInput and kProvideOutput + // appear in the desired places. + ConsolidateIoOperations(nnet_, computation); if (opts.output_debug_info) OutputDebugInfo(computation); } void Compiler::AddCommands(const std::vector &deriv_needed, NnetComputation *computation) { - SetInputOutputInfo(computation); computation->need_model_derivative = request_.need_model_derivative; int32 arbitrary_factor = 8; computation->commands.reserve(computation->matrices.size() * arbitrary_factor); - AllocateMatrices(computation); + + std::vector whole_submatrices; + computation->GetWholeSubmatrices(&whole_submatrices); + AllocateMatrices(whole_submatrices, computation); SetUpPrecomputedIndexes(computation); int32 num_steps = steps_.size(); for (int32 step = 0; step < num_steps; step++) DoForwardComputation(step, computation); - // mark the end of the forward phase. 
- computation->commands.push_back( - NnetComputation::Command(kNoOperationMarker)); + + AddCommandsAfterPropagate(deriv_needed, computation); + for (int32 step = num_steps - 1; step >= 0; step--) if (deriv_needed[step]) DoBackwardComputation(step, computation); - DeallocateMatrices(computation); + DeallocateMatrices(whole_submatrices, computation); +} + +void Compiler::AddCommandsAfterPropagate(const std::vector &deriv_needed, + NnetComputation *computation) { + // mark the end of the forward phase. + computation->commands.push_back( + NnetComputation::Command(kNoOperationMarker)); + + std::vector deriv_input_commands; + + // We handle output nodes here-- add commands that relate to us providing + // outputs to the user; then, if applicable, we add commands to direct us to + // accept derivatives w.r.t. those outputs from the user. + int32 num_steps = steps_.size(); + for (int32 step = 0; step < num_steps; step++) { + const StepInfo &step_info = steps_[step]; + if (nnet_.IsOutputNode(step_info.node_index)) { + int32 node_index = step_info.node_index, + submatrix_index = step_info.value; + KALDI_ASSERT(computation->IsWholeMatrix(submatrix_index)); + NnetComputation::Command c(kProvideOutput, submatrix_index, node_index); + computation->commands.push_back(c); + if (deriv_needed[step]) { + int32 deriv_submatrix_index = step_info.deriv; + KALDI_ASSERT(deriv_submatrix_index > 0); + KALDI_ASSERT(computation->IsWholeMatrix(deriv_submatrix_index)); + NnetComputation::Command c(kAcceptInput, deriv_submatrix_index, node_index); + deriv_input_commands.push_back(c); + } + } + } + computation->commands.insert(computation->commands.end(), + deriv_input_commands.begin(), + deriv_input_commands.end()); } @@ -321,37 +361,18 @@ void Compiler::CreateLocationInfo( } } -void Compiler::SetInputOutputInfo(NnetComputation *computation) const { - KALDI_ASSERT(computation->input_output_info.empty()); - int32 num_steps = steps_.size(); - for (int32 step = 0; step < num_steps; step++) { - const StepInfo &this_info = steps_[step]; - int32 node_index = this_info.node_index; - if (nnet_.IsInputNode(node_index) || nnet_.IsOutputNode(node_index)) { - // There should be only one step for each input or output node. - KALDI_ASSERT(computation->input_output_info.count(node_index) == 0); - int32 value_matrix_index = - computation->submatrices[this_info.value].matrix_index; - int32 deriv_matrix_index = 0; - if (this_info.deriv != 0) - deriv_matrix_index = - computation->submatrices[this_info.deriv].matrix_index; - computation->input_output_info[node_index] = - std::pair(value_matrix_index, deriv_matrix_index); - } - } -} - - void Compiler::DoForwardComputation(int32 step, NnetComputation *computation) const { KALDI_ASSERT(step < static_cast(steps_.size())); const StepInfo &step_info = steps_[step]; const NetworkNode &node = nnet_.GetNode(step_info.node_index); switch (node.node_type) { - case kInput: case kDimRange: break; // Nothing to do. + case kInput: // Note: input nodes appear before other node types. + AddForwardStepInput(step, computation); + break; + case kDimRange: break; // Nothing to do. case kComponent: - AddPropagateStep(step, computation); + AddForwardStepComponent(step, computation); break; case kDescriptor: DoForwardComputationDescriptor(step, computation); @@ -757,9 +778,13 @@ void Compiler::DoBackwardComputation(int32 step, const NetworkNode &node = nnet_.GetNode(node_index); switch (node.node_type) { - case kInput: case kDimRange: break; // Nothing to do. 
+ case kInput: + AddBackwardStepInput(step, computation); + break; + case kDimRange: + break; // Nothing to do. case kComponent: - AddBackpropStep(step, computation); + AddBackwardStepComponent(step, computation); break; case kDescriptor: DoBackwardComputationDescriptor(step, computation); @@ -769,9 +794,28 @@ void Compiler::DoBackwardComputation(int32 step, } } +// This just adds a command of type kAcceptInput that directs the computer to +// expect input from the user. Because inputs are always listed first in +// 'steps', these will precede the actual commands. +void Compiler::AddForwardStepInput(int32 step, + NnetComputation *computation) const { + KALDI_ASSERT(static_cast(step) < steps_.size()); + const StepInfo &step_info = steps_[step]; + int32 node_index = step_info.node_index, + submatrix_index = step_info.value; + KALDI_ASSERT(computation->IsWholeMatrix(submatrix_index)); + + const NetworkNode &node = nnet_.GetNode(node_index); + // actually currently the node type would always be kInput. + KALDI_ASSERT(node.node_type == kInput || node.node_type == kComponent); + + NnetComputation::Command c(kAcceptInput, submatrix_index, node_index); + computation->commands.push_back(c); +} + -void Compiler::AddPropagateStep(int32 step, - NnetComputation *computation) const { +void Compiler::AddForwardStepComponent(int32 step, + NnetComputation *computation) const { KALDI_ASSERT(static_cast(step) < steps_.size()); const StepInfo &step_info = steps_[step]; int32 input_step = step - 1; @@ -780,9 +824,6 @@ void Compiler::AddPropagateStep(int32 step, const NetworkNode &node = nnet_.GetNode(node_index); KALDI_ASSERT(node.node_type == kComponent); - // in setting the following two variables, we use the fact that the submatrix - // index of each submatrix that represents an entire matrix, is the same as - // the matrix index of that matrix. int32 input_submatrix_index = input_step_info.value, output_submatrix_index = step_info.value; NnetComputation::Command c(kPropagate, @@ -804,8 +845,26 @@ void Compiler::AddPropagateStep(int32 step, } -void Compiler::AddBackpropStep(int32 step, - NnetComputation *computation) const { +void Compiler::AddBackwardStepInput(int32 step, + NnetComputation *computation) const { + KALDI_ASSERT(static_cast(step) < steps_.size()); + const StepInfo &step_info = steps_[step]; + int32 node_index = step_info.node_index, + deriv_submatrix_index = step_info.deriv; + if (deriv_submatrix_index == 0) + return; // Nothing to do. + KALDI_ASSERT(computation->IsWholeMatrix(deriv_submatrix_index)); + const NetworkNode &node = nnet_.GetNode(node_index); + // actually, currently the node type would always be kInput. + KALDI_ASSERT(node.node_type == kInput || node.node_type == kComponent); + + NnetComputation::Command c(kProvideOutput, deriv_submatrix_index, node_index); + computation->commands.push_back(c); +} + + +void Compiler::AddBackwardStepComponent(int32 step, + NnetComputation *computation) const { KALDI_ASSERT(static_cast(step) < steps_.size()); const StepInfo &step_info = steps_[step]; int32 input_step = step - 1; @@ -816,9 +875,6 @@ void Compiler::AddBackpropStep(int32 step, int32 component_index = node.u.component_index; const Component *component = nnet_.GetComponent(component_index); - // in setting the following two variables, we use the fact that the submatrix - // index of each submatrix that represents an entire matrix, is the same as - // the matrix index of that matrix. 
int32 input_submatrix_index = input_step_info.value, output_submatrix_index = step_info.value, input_deriv_submatrix_index = input_step_info.deriv, @@ -844,7 +900,8 @@ void Compiler::AddBackpropStep(int32 step, -void Compiler::AllocateMatrices(NnetComputation *computation) const { +void Compiler::AllocateMatrices(const std::vector &whole_submatrices, + NnetComputation *computation) const { KALDI_ASSERT(computation->commands.empty()); // Work out which matrices are inputs to the computation (or output-derivs, // which are also supplied as inputs to the computation); we won't be setting @@ -873,14 +930,17 @@ void Compiler::AllocateMatrices(NnetComputation *computation) const { } } - for (int32 m = 1; m < computation->matrices.size(); m++) { + int32 num_matrices = computation->matrices.size(); + for (int32 m = 1; m < num_matrices; m++) { // Later in the optimization phase, it turns out that zeroing is not // necessary for some matrices, we'll turn these commands into // kAllocMatrixUndefined. // We don't set up the matrices that are inputs to the computation; // this happens when the user provides the input. if (input_and_oderiv_matrices.count(m) == 0) { - NnetComputation::Command c(kAllocMatrixZeroed, m); + // get a submatrix index that refers to the entire matrix. + int32 submatrix_index = whole_submatrices[m]; + NnetComputation::Command c(kAllocMatrixZeroed, submatrix_index); computation->commands.push_back(c); } } @@ -926,7 +986,8 @@ void Compiler::SetUpPrecomputedIndexes( } -void Compiler::DeallocateMatrices(NnetComputation *computation) { +void Compiler::DeallocateMatrices(const std::vector &whole_submatrices, + NnetComputation *computation) { // This adds the commands to destroy all the matrices- but not the // ones that might be needed as outputs of the computation. The ones that // are spared from destruction are those corresponding to outputs of the @@ -967,10 +1028,13 @@ void Compiler::DeallocateMatrices(NnetComputation *computation) { } } // note: matrix-index 0 is the empty matrix. - for (int32 m = 1; m < num_matrices; m++) - if (will_destroy[m]) + for (int32 m = 1; m < num_matrices; m++) { + if (will_destroy[m]) { + int32 submatrix_index = whole_submatrices[m]; computation->commands.push_back( - NnetComputation::Command(kDeallocMatrix, m)); + NnetComputation::Command(kDeallocMatrix, submatrix_index)); + } + } } void Compiler::OutputDebugInfo(NnetComputation *computation) const { diff --git a/src/nnet3/nnet-compile.h b/src/nnet3/nnet-compile.h index 4dda38ae723..195ac36006a 100644 --- a/src/nnet3/nnet-compile.h +++ b/src/nnet3/nnet-compile.h @@ -60,7 +60,6 @@ class Compiler { // multiple commands. struct StepInfo { int32 node_index; // network-node index - bool is_input; // true if step corresponds to an input to the computation. int32 value; // sub-matrix index of value that this step outputs. int32 deriv; // sub-matrix index of derivative at the output of this step; zero // if not used (note: index zero is reserved for the empty @@ -93,8 +92,8 @@ class Compiler { // backprop. std::vector > > > input_locations_list; - StepInfo(): node_index(-1), is_input(false), value(0), - deriv(0), precomputed_indexes_index(0) { } + StepInfo(): node_index(-1), value(0), deriv(0), + precomputed_indexes_index(0) { } }; // this sets up cindex_id_to_location_. 
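// Illustrative sketch (not a hunk of this patch): with these changes the compiled
// command stream carries explicit kAcceptInput / kProvideOutput commands in place
// of the old input_output_info map.  Assuming an Nnet 'nnet' and a
// ComputationRequest 'request' set up as in the existing compiler tests, the
// effect can be inspected as follows (the final placement of the I/O commands is
// decided by ConsolidateIoOperations()):
NnetComputation computation;
Compiler compiler(request, nnet);
CompilerOptions opts;
compiler.CreateComputation(opts, &computation);
// The printed command stream now contains lines such as
//   m1 = user input [for node: 'input']        (kAcceptInput)
//   output m2 to user [for node: 'output']     (kProvideOutput)
computation.Print(std::cerr, nnet);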
@@ -143,17 +142,16 @@ class Compiler { // Adds to the computation object the information about the matrix sizes void DefineMatrices(NnetComputation *computation) const; - // sets up the input_output_info of the computation (this says where the - // values and derivatives for the inputs and outputs live). - void SetInputOutputInfo(NnetComputation *computation) const; - // Sets up sub-matrix indexes for nodes of type Descriptor (needed mainly // because Descriptors in general have many parts corresponding to // feature-dimension ranges, and they live in sub-matrices. void DefineSubmatrices(NnetComputation *computation); // Adds to the computation object the commands to allocate the matrices. - void AllocateMatrices(NnetComputation *computation) const; + // 'whole_submatrices' is as created by computation->GetWholeSubmatrices(), it + // gives us the index of a submatrix containing the whole of each matrix. + void AllocateMatrices(const std::vector &whole_submatrices, + NnetComputation *computation) const; // Sets up the precomputed indexes for each component, and sets the // precomputed_indexes_index value for each step. @@ -165,7 +163,11 @@ class Compiler { // Called from DoForwardComputation, handles the case where the step corresponds // to a Component. - void AddPropagateStep(int32 step, NnetComputation *computation) const; + void AddForwardStepComponent(int32 step, NnetComputation *computation) const; + + // Called from DoForwardComputation, handles the case where the step corresponds + // to an input node. + void AddForwardStepInput(int32 step, NnetComputation *computation) const; // Called from DoForwardComputation, handles the case where the step @@ -246,7 +248,12 @@ class Compiler { // Called from DoBackwardComputation, handles the case where the step corresponds // to a Component. - void AddBackpropStep(int32 step, NnetComputation *computation) const; + void AddBackwardStepComponent(int32 step, NnetComputation *computation) const; + + // Called from DoBackwardComputation, handles the case where the step + // corresponds to an input. If applicable, this generates a command for the + // network to provide the derivative w.r.t. the input, to the user. + void AddBackwardStepInput(int32 step, NnetComputation *computation) const; // Called from DoBackwardComputation, handles the case where the step // corresponds to type kDescriptor. @@ -284,11 +291,21 @@ class Compiler { // deinitialize all the matrices, except those that may be requested by // the user after the computation is done (i.e. outputs of the network, // and input derivatives). - void DeallocateMatrices(NnetComputation *computation); + // 'whole_submatrices' is as created by computation->GetWholeSubmatrices(), it + // gives us the index of a submatrix containing the whole of each matrix. + void DeallocateMatrices(const std::vector &whole_submatrices, + NnetComputation *computation); // sets up the debug_info member of "computation". void OutputDebugInfo(NnetComputation *computation) const; + + // this function, called from AddCommands, adds the output and input + // commands that happen after the forward pass and before the backward + // pass. 
+ void AddCommandsAfterPropagate(const std::vector &deriv_needed, + NnetComputation *computation); + void AddCommands(const std::vector &deriv_needed, NnetComputation *computation); diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index bb6cf0dd68e..bcbb47e5fd8 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -277,6 +277,10 @@ void NnetComputation::Command::Read(std::istream &is, bool binary) { command_type = kAddToRowsMulti; } else if (command_type_str == "kAddRowRanges") { command_type = kAddRowRanges; + } else if (command_type_str == "kAcceptInput") { + command_type = kAcceptInput; + } else if (command_type_str == "kProvideOutput") { + command_type = kProvideOutput; } else if (command_type_str == "kNoOperation") { command_type = kNoOperation; } else if (command_type_str == "kNoOperationMarker") { @@ -362,6 +366,12 @@ void NnetComputation::Command::Write(std::ostream &os, bool binary) const { case kAddRowRanges: os << "kAddRowRanges\n"; break; + case kAcceptInput: + os << "kAcceptInput\n"; + break; + case kProvideOutput: + os << "kProvideOutput\n"; + break; case kNoOperation: os << "kNoOperation\n"; break; @@ -492,28 +502,30 @@ static void PrintCommand(std::ostream &os, const NnetComputation::Command &c = computation.commands[command_index]; switch (c.command_type) { case kAllocMatrixZeroed: - os << "m" << c.arg1 << " = zeros(" - << computation.matrices[c.arg1].num_rows - << ',' << computation.matrices[c.arg1].num_cols << ")\n"; + os << submatrix_strings[c.arg1] << " = zeros(" + << computation.submatrices[c.arg1].num_rows + << ',' << computation.submatrices[c.arg1].num_cols << ")\n"; break; case kAllocMatrixUndefined: - os << "m" << c.arg1 << " = undefined(" - << computation.matrices[c.arg1].num_rows - << ',' << computation.matrices[c.arg1].num_cols << ")\n"; + os << submatrix_strings[c.arg1] << " = undefined(" + << computation.submatrices[c.arg1].num_rows + << ',' << computation.submatrices[c.arg1].num_cols << ")\n"; break; case kDeallocMatrix: - os << "m" << c.arg1 << " = []\n"; + os << submatrix_strings[c.arg1] << " = []\n"; break; case kAllocMatrixFromOther: - os << "m" << c.arg1 << ".swap(m" << c.arg2 << ") [dim = " - << computation.matrices[c.arg1].num_rows << " x " - << computation.matrices[c.arg1].num_cols << "]\n"; + os << submatrix_strings[c.arg1] << ".swap(" + << submatrix_strings[c.arg2] << ") [dim = " + << computation.submatrices[c.arg1].num_rows << " x " + << computation.submatrices[c.arg1].num_cols << "]\n"; break; case kAllocMatrixFromOtherZeroed: - os << "m" << c.arg1 << ".swap(m" << c.arg2 << ") [dim = " - << computation.matrices[c.arg1].num_rows << " x " - << computation.matrices[c.arg1].num_cols << "]; m" - << c.arg1 << ".zero();\n"; + os << submatrix_strings[c.arg1] << ".swap(" + << submatrix_strings[c.arg2] << ") [dim = " + << computation.submatrices[c.arg1].num_rows << " x " + << computation.submatrices[c.arg1].num_cols << "]; " + << submatrix_strings[c.arg1] << ".zero();\n"; break; case kPropagate: os << nnet.GetComponentName(c.arg1) << ".Propagate("; @@ -582,6 +594,14 @@ static void PrintCommand(std::ostream &os, os << "])\n"; break; } + case kAcceptInput: + os << submatrix_strings[c.arg1] << " = user input [for node: '" + << nnet.GetNodeName(c.arg2) << "']\n"; + break; + case kProvideOutput: + os << "output " << submatrix_strings[c.arg1] << " to user" + << " [for node: '" << nnet.GetNodeName(c.arg2) << "']\n"; + break; case kNoOperation: os << "[no-op]\n"; break; @@ -611,20 +631,6 @@ static void 
PrintComputationPreamble( os << ", "; } os << "\n"; - // show which matrices the inputs and outputs map to. - for (unordered_map >::const_iterator iter = - c.input_output_info.begin(); iter != c.input_output_info.end(); - ++iter) { - int32 node_index = iter->first, - value_matrix_index = iter->second.first, - deriv_matrix_index = iter->second.second; - os << nnet.GetNodeName(node_index) << ".value -> m" - << value_matrix_index << "\n"; - if (deriv_matrix_index != 0) { - os << nnet.GetNodeName(node_index) << ".deriv -> m" - << deriv_matrix_index << "\n"; - } - } if (!c.matrix_debug_info.empty()) { os << "# The following show how matrices correspond to network-nodes and\n" << "# cindex-ids. Format is: matrix = .[value|deriv][ ]\n" @@ -657,9 +663,25 @@ void NnetComputation::Print(std::ostream &os, const Nnet &nnet) const { } void NnetComputation::Read(std::istream &is, bool binary) { + int32 version = 2, // must be in sync with 'version' in Write. + version_in = 1; // defaults to 1 if no version specified. + ExpectToken(is, binary, ""); + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &version_in); + ExpectToken(is, binary, ""); + } else { + KALDI_ASSERT(token == ""); + } + if (version_in != version) { + KALDI_ERR << "Reading NnetComputation failed because version in " + << version_in << " != " << version << "... you can " + << "ignore this error if the program continues afterward, " + << "it would only affect speed."; + } size_t num_matrices; - ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_matrices); KALDI_ASSERT(num_matrices >= 0); matrices.resize(num_matrices); @@ -738,21 +760,6 @@ void NnetComputation::Read(std::istream &is, bool binary) { ReadIntegerPairVector(is, binary, &(indexes_ranges[c])); } - size_t num_input_output_info; - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &num_input_output_info); - KALDI_ASSERT(num_input_output_info >= 0); - input_output_info.clear(); - ExpectToken(is, binary, ""); - for (size_t c = 0; c < num_input_output_info; c++) { - int32 key; - std::pair val; - ReadBasicType(is, binary, &key); - ReadBasicType(is, binary, &(val.first)); - ReadBasicType(is, binary, &(val.second)); - input_output_info.insert(std::pair >(key, val)); - } - size_t num_commands; ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_commands); @@ -771,7 +778,10 @@ void NnetComputation::Read(std::istream &is, bool binary) { } void NnetComputation::Write(std::ostream &os, bool binary) const { + int32 version = 2; // Must be in sync with version in Read. 
WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, version); WriteToken(os, binary, ""); WriteBasicType(os, binary, matrices.size()); WriteToken(os, binary, ""); @@ -832,18 +842,6 @@ void NnetComputation::Write(std::ostream &os, bool binary) const { WriteIntegerPairVector(os, binary, indexes_ranges[c]); } - if (!binary) os << std::endl; - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_output_info.size()); - WriteToken(os, binary, ""); - std::map > input_output_info_cp(input_output_info.begin(), input_output_info.end()); - for (std::map >::const_iterator iter = - input_output_info_cp.begin(); iter != input_output_info_cp.end(); ++iter) { - WriteBasicType(os, binary, iter->first); - WriteBasicType(os, binary, iter->second.first); - WriteBasicType(os, binary, iter->second.second); - } - if (!binary) os << std::endl; WriteToken(os, binary, ""); WriteBasicType(os, binary, commands.size()); @@ -1056,7 +1054,6 @@ NnetComputation::NnetComputation(const NnetComputation &other): indexes(other.indexes), indexes_multi(other.indexes_multi), indexes_ranges(other.indexes_ranges), - input_output_info(other.input_output_info), commands(other.commands), need_model_derivative(other.need_model_derivative), indexes_cuda(other.indexes_cuda), @@ -1075,7 +1072,6 @@ NnetComputation& NnetComputation::operator = (const NnetComputation &other) { indexes = other.indexes; indexes_multi = other.indexes_multi; indexes_ranges = other.indexes_ranges; - input_output_info = other.input_output_info; commands = other.commands; need_model_derivative = other.need_model_derivative; indexes_cuda = other.indexes_cuda; @@ -1091,5 +1087,20 @@ NnetComputation& NnetComputation::operator = (const NnetComputation &other) { return *this; } + +void NnetComputation::GetWholeSubmatrices( + std::vector *whole_submatrices) const { + whole_submatrices->resize(matrices.size(), 0); + int32 num_submatrices = submatrices.size(); + for (int32 s = 1; s < num_submatrices; s++) { + if (IsWholeMatrix(s)) { + int32 m = submatrices[s].matrix_index; + (*whole_submatrices)[m] = s; + } + } +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index a5f8cc2aca7..6097b059d23 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -158,14 +158,19 @@ struct ComputationRequest { the NnetComputation. We declare it outside that class because it's so frequently used and we got tired of typing NnetComputation:: everywhere. We document the commands here. - - - kAllocMatrixUndefined: Allocate a matrix. arg1 = index of matrix. - - kAllocMatrixZeroed: Allocate and zero a matrix. arg1 = index of matrix. - - kDeallocMatrix: Deallocate a matrix. arg1 = index of matrix. - - kAllocMatrixFromOther: initialize matrix indexed arg1 using memory - from matrix indexed arg2 (using shallow swap). - - kAllocMatrixFromOtherZeroed: initialize matrix indexed arg1 using memory - from matrix indexed arg2 (using shallow swap), then zero the matrix + Note: for operations that naturally need to operate on entire matrices + (i.e. allocation commands and input and output commands), we use the + submatrix indexes of them, which turns out to be more convenient for + optimization; but these submatrix indexes must refer to the whole of + a matrix. + + - kAllocMatrixUndefined: Allocate a matrix. arg1 = submatrix index. + - kAllocMatrixZeroed: Allocate and zero a matrix. arg1 = submatrix index. + - kDeallocMatrix: Deallocate a matrix. 
arg1 = submatrix index. + - kAllocMatrixFromOther: initialize matrix with submatrix index arg1 using memory + from matrix with submatrix index arg2 (using shallow swap). + - kAllocMatrixFromOtherZeroed: initialize matrix with submatrix index arg1 using memory + from matrix with submatrix index arg2 (using shallow swap), then zero the matrix we just allocated. - kPropagate: Forward computation of neural net, see Component::Propagate() - arg1 is is component-index in neural net @@ -204,8 +209,19 @@ struct ComputationRequest { - kAddRowRanges: call \ref CuMatrix::AddRowRanges() "AddRowRanges()" on sub-matrix arg1, with arg2 as source sub-matrix, and indexes given indexes_ranges[arg3]. + - kAcceptInput: accepts a matrix of input from the user, which may be either + features, or derivatives w.r.t. the output. arg1 is the submatrix index of + a whole matrix that the input goes to, and arg2 is the index of the network + node associated with it (e.g. the node named "input" or "ivector"), for + purposes of double checking. + - kProvideOutput: outputs a matrix to the user: either a network output, or a + matrix of derivatives w.r.t. an input. arg1 is the submatrix index of the + output (which we expect to be a whole matrix), arg2 is the index of the + network node associated with it (e.g. the node for "output"). - kNoOperation: does nothing (sometimes useful during optimization) - - kNoOperationMarker: does nothing, but used to mark end of forward commands. + - kNoOperationMarker: does nothing, but used to mark the end of a block + of commands (like forward commands). + */ enum CommandType { kAllocMatrixUndefined, kAllocMatrixZeroed, @@ -213,7 +229,9 @@ kPropagate, kStoreStats, kBackprop, kBackpropNoModelUpdate, kMatrixCopy, kMatrixAdd, kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti, kAddRowsMulti, kAddToRowsMulti, - kAddRowRanges, kNoOperation, kNoOperationMarker }; + kAddRowRanges, kAcceptInput, kProvideOutput, + kNoOperation, kNoOperationMarker }; + // struct NnetComputation defines the specific steps of a neural-net @@ -272,9 +290,9 @@ struct NnetComputation { }; // "matrices" describes the sizes of the matrices that we use as variables in - // the computation [note: index zero is reserved for an empty matrix]. Most - // commands refer to submatrices below (note: each matrix will have its own - // sub-matrix that just refers to the entire matrix). + // the computation [note: index zero is reserved for an empty matrix]. Note: + // we generally don't refer to matrices, even ones known to be whole matrices, + // using their matrix index directly, but via their submatrix indexes. std::vector<MatrixInfo> matrices; // debug information for each of the matrices (indexed by matrix-index), only @@ -312,11 +330,11 @@ // end-index) std::vector<std::vector<std::pair<int32,int32> > > indexes_ranges; - // Information about where the values and derivatives of inputs and outputs of - // the neural net live. Indexed by the node_index (the same index as used for - // the nodes_ array in the Nnet), each pair is (value_matrix_index, - // deriv_matrix_index), with 0 for derivatives that are not present. - unordered_map<int32, std::pair<int32, int32> > input_output_info; +// // Information about where the values and derivatives of inputs and outputs of +// // the neural net live. Indexed by the node_index (the same index as used for +// // the nodes_ array in the Nnet), each pair is (value_matrix_index, - // deriv_matrix_index), with 0 for derivatives that are not present. +// unordered_map<int32, std::pair<int32, int32> > input_output_info; // The sequence of commands.
std::vector commands; @@ -369,6 +387,11 @@ struct NnetComputation { void GetSubmatrixStrings(const Nnet &nnet, std::vector *submat_strings) const; + // This function outputs a vector, indexed by matrix index, that gives you for + // each matrix, the index of a submatrix which refers to the whole of that + // matrix (or 0 if there is no such submatrix, which should not happen). + void GetWholeSubmatrices(std::vector *whole_submatrices) const; + // This function outputs information similar to Print(), but outputs the // preamble as a string and a vector of strings, one per command (with no diff --git a/src/nnet3/nnet-compute-test.cc b/src/nnet3/nnet-compute-test.cc index 6cdde0015f2..f69d4d3036a 100644 --- a/src/nnet3/nnet-compute-test.cc +++ b/src/nnet3/nnet-compute-test.cc @@ -142,7 +142,7 @@ void UnitTestNnetCompute() { KALDI_LOG << "Input sum is " << temp.Sum(); computer.AcceptInput(request.inputs[i].name, &temp); } - computer.Forward(); + computer.Run(); const CuMatrixBase &output(computer.GetOutput("output")); TestNnetDecodable(request, inputs, nnet, output); @@ -151,15 +151,16 @@ void UnitTestNnetCompute() { CuMatrix output_deriv(output.NumRows(), output.NumCols()); output_deriv.SetRandn(); // output_deriv sum won't be informative so don't print it. - if (request.outputs[0].has_deriv) - computer.AcceptOutputDeriv("output", &output_deriv); - computer.Backward(); - for (size_t i = 0; i < request.inputs.size(); i++) { - if (request.inputs[i].has_deriv) { - const CuMatrixBase &in_deriv = - computer.GetInputDeriv(request.inputs[i].name); - KALDI_LOG << "Input-deriv sum for input '" - << request.inputs[i].name << "' is " << in_deriv.Sum(); + if (request.outputs[0].has_deriv) { + computer.AcceptInput("output", &output_deriv); + computer.Run(); + for (size_t i = 0; i < request.inputs.size(); i++) { + if (request.inputs[i].has_deriv) { + const CuMatrixBase &in_deriv = + computer.GetOutput(request.inputs[i].name); + KALDI_LOG << "Input-deriv sum for input '" + << request.inputs[i].name << "' is " << in_deriv.Sum(); + } } } } @@ -171,7 +172,7 @@ void UnitTestNnetCompute() { int main() { using namespace kaldi; using namespace kaldi::nnet3; - //SetVerboseLevel(2); + SetVerboseLevel(4); for (kaldi::int32 loop = 0; loop < 2; loop++) { diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 34f5df523f1..b497e34aac4 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -30,7 +30,7 @@ NnetComputer::NnetComputer(const NnetComputeOptions &options, const Nnet &nnet, Nnet *nnet_to_update): options_(options), computation_(computation), nnet_(nnet), - nnet_to_update_(nnet_to_update) { + program_counter_(0), nnet_to_update_(nnet_to_update) { KALDI_ASSERT(computation.indexes_cuda.size() == computation.indexes.size() && computation.indexes_ranges_cuda.size() == computation.indexes_ranges.size() && "You must call NnetComputation::ComputeCudaIndexes() before " @@ -147,29 +147,37 @@ void NnetComputer::DebugAfterExecute(int32 command, void NnetComputer::ExecuteCommand(int32 command) { const NnetComputation::Command &c = computation_.commands[command]; + int32 m1, m2; try { switch (c.command_type) { case kAllocMatrixZeroed: - matrices_[c.arg1].Resize(computation_.matrices[c.arg1].num_rows, - computation_.matrices[c.arg1].num_cols, + m1 = computation_.submatrices[c.arg1].matrix_index; + matrices_[m1].Resize(computation_.matrices[m1].num_rows, + computation_.matrices[m1].num_cols, kSetZero, - computation_.matrices[c.arg1].stride_type); + computation_.matrices[m1].stride_type); 
break; case kAllocMatrixUndefined: - matrices_[c.arg1].Resize(computation_.matrices[c.arg1].num_rows, - computation_.matrices[c.arg1].num_cols, + m1 = computation_.submatrices[c.arg1].matrix_index; + matrices_[m1].Resize(computation_.matrices[m1].num_rows, + computation_.matrices[m1].num_cols, kUndefined, - computation_.matrices[c.arg1].stride_type); + computation_.matrices[m1].stride_type); break; case kDeallocMatrix: - matrices_[c.arg1].Resize(0, 0); + m1 = computation_.submatrices[c.arg1].matrix_index; + matrices_[m1].Resize(0, 0); break; case kAllocMatrixFromOther: - matrices_[c.arg1].Swap(&(matrices_[c.arg2])); + m1 = computation_.submatrices[c.arg1].matrix_index; + m2 = computation_.submatrices[c.arg2].matrix_index; + matrices_[m1].Swap(&(matrices_[m2])); break; case kAllocMatrixFromOtherZeroed: - matrices_[c.arg1].Swap(&(matrices_[c.arg2])); - matrices_[c.arg1].SetZero(); + m1 = computation_.submatrices[c.arg1].matrix_index; + m2 = computation_.submatrices[c.arg2].matrix_index; + matrices_[m1].Swap(&(matrices_[m2])); + matrices_[m1].SetZero(); break; case kPropagate: { const Component *component = nnet_.GetComponent(c.arg1); @@ -352,69 +360,56 @@ void NnetComputer::GetPointers(int32 indexes_multi_index, reinterpret_cast*>(pointers)); } -void NnetComputer::Forward() { - CheckInputs(false); - int32 size = computation_.commands.size(), i = 0; +void NnetComputer::Run() { const std::vector &c = computation_.commands; - CommandDebugInfo info; - Timer timer; - double total_elapsed_previous = 0.0; - - for (; i < size && c[i].command_type != kNoOperationMarker; - i++) { - if (debug_) - DebugBeforeExecute(i, &info); - ExecuteCommand(i); - if (debug_) { - double total_elapsed_now = timer.Elapsed(); - DebugAfterExecute(i, info, total_elapsed_now - total_elapsed_previous); - total_elapsed_previous = total_elapsed_now; - } - - } - -} + int32 num_commands = c.size(); + if (program_counter_ >= num_commands) + KALDI_ERR << "Running computation that has already finished."; + CheckNoPendingIo(); -void NnetComputer::Backward() { - CheckInputs(true); - int32 size = computation_.commands.size(), i = 0; - const std::vector &c = computation_.commands; - for (; i < size && c[i].command_type != kNoOperationMarker; - i++); CommandDebugInfo info; Timer timer; double total_elapsed_previous = 0.0; - for (; i < size; i++) { + for (; program_counter_ < num_commands; program_counter_++) { + if (c[program_counter_].command_type == kAcceptInput || + c[program_counter_].command_type == kProvideOutput) { + // We have hit a part of the computation that requires user + // interaction, e.g. the end of the forward or backward phase. 
+ break; + } if (debug_) - DebugBeforeExecute(i, &info); - ExecuteCommand(i); + DebugBeforeExecute(program_counter_, &info); + ExecuteCommand(program_counter_); if (debug_) { double total_elapsed_now = timer.Elapsed(); - DebugAfterExecute(i, info, total_elapsed_now - total_elapsed_previous); + DebugAfterExecute(program_counter_, info, + total_elapsed_now - total_elapsed_previous); total_elapsed_previous = total_elapsed_now; } } } -void NnetComputer::AcceptInput(const std::string &input_name, +void NnetComputer::AcceptInput(const std::string &node_name, CuMatrix *input) { - bool is_output = false, is_deriv = false; - int32 matrix_index = GetMatrixIndex(input_name, is_output, is_deriv); - KALDI_ASSERT(static_cast(matrix_index) < matrices_.size()); + bool is_output = false; + int32 matrix_index = GetIoMatrixIndex(node_name, is_output); + const NnetComputation::MatrixInfo &matrix_info = computation_.matrices[matrix_index]; - if (input->NumRows() != matrix_info.num_rows) - KALDI_ERR << "Num-rows mismatch for input '" << input_name + if (input->NumRows() != matrix_info.num_rows) { + KALDI_ERR << "Num-rows mismatch for input '" << node_name << "': " << matrix_info.num_rows << " in computation-request, " << input->NumRows() << " provided."; - if (input->NumCols() != matrix_info.num_cols) - KALDI_ERR << "Num-cols mismatch for input '" << input_name + } + if (input->NumCols() != matrix_info.num_cols) { + KALDI_ERR << "Num-cols mismatch for input '" << node_name << "': " << matrix_info.num_cols << " in computation-request, " << input->NumCols() << " provided."; + } if (matrix_info.stride_type == kDefaultStride || input->Stride() == input->NumCols()) { matrices_[matrix_index].Swap(input); @@ -423,130 +418,96 @@ void NnetComputer::AcceptInput(const std::string &input_name, matrix_info.num_cols, kUndefined, kStrideEqualNumCols); matrices_[matrix_index].CopyFromMat(*input); + input->Resize(0, 0); } - input->Resize(0, 0); } -const CuMatrixBase &NnetComputer::GetInputDeriv( - const std::string &input_name) const { - bool is_output = false, is_deriv = true; - int32 matrix_index = GetMatrixIndex(input_name, is_output, is_deriv); - if (matrices_[matrix_index].NumRows() == 0) - KALDI_ERR << "GetInputDeriv called before it is ready (before Backward()?)"; - return matrices_[matrix_index]; -} - - const CuMatrixBase &NnetComputer::GetOutput( - const std::string &output_name) const { - bool is_output = true, is_deriv = false; - int32 matrix_index = GetMatrixIndex(output_name, is_output, is_deriv); - if (matrices_[matrix_index].NumRows() == 0) - KALDI_ERR << "GetOutput called when output not ready (before Forward()?)"; + const std::string &node_name) { + bool is_output = true; + int32 matrix_index = GetIoMatrixIndex(node_name, is_output); + KALDI_ASSERT(matrices_[matrix_index].NumRows() != 0); return matrices_[matrix_index]; } -void NnetComputer::GetOutputDestructive( - const std::string &output_name, - CuMatrix *output) { - bool is_output = true, is_deriv = false; - int32 matrix_index = GetMatrixIndex(output_name, is_output, is_deriv); - if (matrices_[matrix_index].NumRows() == 0) - KALDI_ERR << "GetOutput called when output not ready (before Forward()?)"; - output->Resize(0, 0); + +void NnetComputer::GetOutputDestructive(const std::string &node_name, + CuMatrix *output) { + bool is_output = true; + int32 matrix_index = GetIoMatrixIndex(node_name, is_output); + KALDI_ASSERT(matrices_[matrix_index].NumRows() != 0); matrices_[matrix_index].Swap(output); + matrices_[matrix_index].Resize(0, 0); } -void 
NnetComputer::AcceptOutputDeriv(const std::string &output_name, - CuMatrix *output_deriv) { - bool is_output = true, is_deriv = true; - int32 matrix_index = GetMatrixIndex(output_name, is_output, is_deriv); - KALDI_ASSERT(static_cast(matrix_index) < matrices_.size()); - const NnetComputation::MatrixInfo &matrix_info = - computation_.matrices[matrix_index]; - if (output_deriv->NumRows() != matrix_info.num_rows) - KALDI_ERR << "Num-rows mismatch for output-deriv '" << output_name - << "': " << matrix_info.num_rows - << " in computation-request, " << output_deriv->NumRows() - << " provided."; - if (output_deriv->NumCols() != matrix_info.num_cols) - KALDI_ERR << "Num-cols mismatch for output_deriv '" << output_name - << "': " << matrix_info.num_cols - << " in computation-request, " << output_deriv->NumCols() - << " provided."; - if (matrix_info.stride_type == kDefaultStride || - output_deriv->Stride() == output_deriv->NumCols()) { - matrices_[matrix_index].Swap(output_deriv); - } else { - matrices_[matrix_index].Resize(matrix_info.num_rows, - matrix_info.num_cols, - kUndefined, kStrideEqualNumCols); - matrices_[matrix_index].CopyFromMat(*output_deriv); +void NnetComputer::CheckNoPendingIo() { + const std::vector &c = computation_.commands; + while (program_counter_ < static_cast(c.size()) && + (c[program_counter_].command_type == kAcceptInput || + c[program_counter_].command_type == kProvideOutput)) { + pending_commands_.push_back(program_counter_); + program_counter_++; + } + while (!pending_commands_.empty()) { + // the order here doesn't really matter; we go from back to front + // as it's more efficient, not that efficiency really matters here. + int32 last_command = pending_commands_.back(); + if (c[last_command].command_type == kProvideOutput) { + // we can ignore that we didn't provide output to the user. + KALDI_VLOG(3) << "Output to node '" << nnet_.GetNodeName(c[last_command].arg2) + << "' was available but not used."; + pending_commands_.pop_back(); + } else { + // we can't ignore if we needed input from the user that hasn't been + // provided. + KALDI_ASSERT(c[last_command].command_type == kAcceptInput); + int32 node = c[last_command].arg2; + KALDI_ERR << "Cannot run computation because we did not get input for node '" + << nnet_.GetNodeName(node) << "'"; + } } - output_deriv->Resize(0, 0); } -int32 NnetComputer::GetMatrixIndex( - const std::string &node_name, bool is_output, bool is_deriv) const { +int32 NnetComputer::GetIoMatrixIndex(const std::string &node_name, bool is_output) { + const std::vector &c = computation_.commands; int32 node_index = nnet_.GetNodeIndex(node_name); if (node_index == -1) KALDI_ERR << "No node named '" << node_name << "'in network."; - if (is_output) { - if (!nnet_.IsOutputNode(node_index)) - KALDI_ERR << "Expecting output node; node named '" - << node_name << "' is not output node."; - } else { - if (nnet_.IsOutputNode(node_index)) - KALDI_ERR << "Expecting input node or component node; node named '" - << node_name << "' is output node."; - } - unordered_map >::const_iterator - iter = computation_.input_output_info.find(node_index), - end = computation_.input_output_info.end(); - if (iter == end) - KALDI_ERR << "Not expecting input or output for node named '" << node_name - << "' (not in computation request)"; - std::pair locations = iter->second; - int32 location; - if (is_deriv) { - location = locations.second; - if (locations.second <= 0) // No deriv expected. 
- KALDI_ERR << "Not expecting derivative information for node named '" - << node_name << "' (not in computation request)"; - } else { - location = locations.first; + // first make sure all the I/O commands that we immediately expect, are listed + // in 'pending_commands_'. + while (program_counter_ < static_cast(computation_.commands.size()) && + (c[program_counter_].command_type == kAcceptInput || + c[program_counter_].command_type == kProvideOutput || + c[program_counter_].command_type == kNoOperationMarker)) { + if (c[program_counter_].command_type != kNoOperationMarker) + pending_commands_.push_back(program_counter_); + program_counter_++; } - KALDI_ASSERT(static_cast(location) < matrices_.size()); - return location; -} - -void NnetComputer::CheckInputs(bool check_output_deriv) const { - unordered_map >::const_iterator - iter = computation_.input_output_info.begin(), - end = computation_.input_output_info.end(); - for (; iter != end; ++iter) { - int32 node_index = iter->first, - value_matrix_index = iter->second.first, - deriv_matrix_index = iter->second.second; - std::string name = nnet_.GetNodeName(node_index); - if (nnet_.IsOutputNode(node_index)) { - if (check_output_deriv && deriv_matrix_index > 0) { - KALDI_ASSERT(static_cast(deriv_matrix_index) < matrices_.size()); - if (matrices_[deriv_matrix_index].NumRows() == 0) - KALDI_ERR << "Output-derivative required but not provided for node '" - << name << "'."; - } - } else { - if (!check_output_deriv) { - if (matrices_[value_matrix_index].NumRows() == 0) - KALDI_ERR << "Input required but not provided for node '" - << name << "'."; - } + for (size_t i = 0; i < pending_commands_.size(); i++) { + int32 command = pending_commands_[i]; + bool this_command_is_output = + (c[command].command_type == kProvideOutput); + int32 this_submatrix_index = c[command].arg1, + this_node_index = c[command].arg2; + if (this_command_is_output == is_output && node_index == this_node_index) { + pending_commands_.erase(pending_commands_.begin() + i); + if (!(computation_.IsWholeMatrix(this_submatrix_index))) + KALDI_ERR << "Getting input or output that is not a whole matrix " + << "(probably some optimization code needs to be changed)"; + return computation_.submatrices[this_submatrix_index].matrix_index; } } + // if you get the following error it will likely be a bug in the calling code, + // or possibly due to giving the wrong egs. + KALDI_ERR << "Could not " + << (is_output ? "provide output " : " accept input ") + << "for network node " << node_name + << " (it is not expected at this point in the computation)"; + return 0; // Suppress compiler warnings; this line will never be reached. } + void NnetComputer::AcceptInputs(const Nnet &nnet, const std::vector &io_vec) { for (size_t i = 0; i < io_vec.size(); i++) { diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index d1c28e8bd7c..32839755828 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -53,8 +53,8 @@ struct NnetComputeOptions { "computation" object. You call in sequence, the constructor, then AcceptInput() [or AcceptInputs()], - then Forward(), then GetOutput(), then if applicable (Backward(), then if - applicable GetInputDeriv()). + then Run(), then GetOutput() [and if applicable, AcceptOutputDeriv], then if + there is a backward computation, Run() [then, if applicable, GetInputDeriv()]. */ class NnetComputer { public: @@ -67,52 +67,55 @@ class NnetComputer { const Nnet &nnet, Nnet *nnet_to_update); - /// e.g. AcceptInput ("input", input_mat). 
Will crash if there is no - /// input node with the given name. This function is destructive of "input" - /// as it takes it using the Swap function of CuMatrix. - /// Must have the same number of rows as the corresponding input described - /// in the ComputationRequest e.g. the indexes.size() in the corresponding + /// e.g. AcceptInput ("input", &input_mat), or for derivatives w.r.t. the + /// output, AcceptInput("output", &output_deriv_mat). Will crash if there is + /// no input or output node with the given name. This function is destructive + /// of "input" as it takes it using the Swap function of CuMatrix. Must have + /// the same number of rows as the corresponding input described in the + /// ComputationRequest e.g. the indexes.size() in the corresponding /// IoSpecification. - void AcceptInput(const std::string &input_name, + void AcceptInput(const std::string &node_name, CuMatrix<BaseFloat> *input); - /// This function calls AcceptInput() in turn on all the inputs in the - /// training example. It needs "nnet" only in order to distinguish inputs + /// This convenience function calls AcceptInput() in turn on all the inputs in + /// the training example. It needs "nnet" only in order to distinguish inputs /// from outputs. void AcceptInputs(const Nnet &nnet, const std::vector<NnetIo> &io); - // Does the forward computation. - void Forward(); + /// This does either the forward or backward computation, depending on + /// when it is called (in a typical computation, the first time you call + /// this it will do the forward computation; then you'll take the outputs + /// and provide derivatives; and the second time you call it, it will do + /// the backward computation). There used to be two separate functions + /// Forward() and Backward(). + void Run(); - // e.g. GetOutput ("output"). Will crash if no such output. - const CuMatrixBase<BaseFloat> &GetOutput(const std::string &output_name) const; + // e.g. GetOutput("output"). This function can also be used to get + // derivatives w.r.t. inputs. It's non-const because it may only + // be called once and it keeps track of that. + const CuMatrixBase<BaseFloat> &GetOutput(const std::string &node_name); // Version of GetOutput that calls Swap(), destroying the output stored inside // this object. You should probably not use this if you plan to call - // Backward() on the same NnetComputer object, it may lead to a crash. + // Backward() on the same NnetComputer object, or if it's a recurrent + // computation-- it may lead to a crash. void GetOutputDestructive(const std::string &output_name, CuMatrix<BaseFloat> *output); - /// e.g. AcceptOutputDeriv("output", &output_deriv_mat). - void AcceptOutputDeriv(const std::string &output_name, - CuMatrix<BaseFloat> *output_deriv); - - - // Does the backward computation. - void Backward(); - - // e.g. GetInputDeriv ("input"). Will crash if no such input derivative. - // You may only call this if you requested this input derivative in the - // ComputationRequest. - const CuMatrixBase<BaseFloat> &GetInputDeriv( - const std::string &input_name) const; private: const NnetComputeOptions &options_; const NnetComputation &computation_; const Nnet &nnet_; + int32 program_counter_; // command index to execute next. + // To deal with inputs and outputs that are not provided/taken by the user in + // the same order as listed in the computation, pending_commands_ contains a + // list of program commands that were skipped over but are in the queue to be + // executed. + std::vector<int32> pending_commands_; + Nnet *nnet_to_update_; bool debug_; // command_attributes_ is only used if debug_=true.
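// Minimal usage sketch (illustrative, not a hunk of this patch) of the new
// single-entry-point flow, mirroring the updated nnet-compute-test.cc.  It
// assumes: a compiled 'computation' on which ComputeCudaIndexes() has been
// called, an Nnet 'nnet' with nodes named "input" and "output", a feature
// matrix 'feats', and a ComputationRequest that asked for input derivatives.
NnetComputeOptions compute_opts;
NnetComputer computer(compute_opts, computation, nnet, NULL);

CuMatrix<BaseFloat> feats_gpu(feats);        // features for the "input" node
computer.AcceptInput("input", &feats_gpu);   // destructive; swaps the matrix in
computer.Run();                              // forward pass
const CuMatrixBase<BaseFloat> &output = computer.GetOutput("output");

// The derivative w.r.t. the output is now supplied with AcceptInput(), and the
// derivative w.r.t. the input is read back with GetOutput().
CuMatrix<BaseFloat> output_deriv(output.NumRows(), output.NumCols());
output_deriv.SetRandn();
computer.AcceptInput("output", &output_deriv);
computer.Run();                              // backward pass
const CuMatrixBase<BaseFloat> &input_deriv = computer.GetOutput("input");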
@@ -125,15 +128,26 @@ class NnetComputer { // The matrices used in the computation. std::vector > matrices_; + // executes the command in computation_.commands[command]. void ExecuteCommand(int32 command); - // Returns the matrix index where the input or output matrix index for - // "node_name" is stored (or its corresponding derivative, if is_deriv==true). - // "is_output" tells the code that this is an output node, as opposed to an - // input node; it's used only for checking. - int32 GetMatrixIndex(const std::string &node_name, - bool is_output, bool is_deriv) const; + // Returns the matrix index where the input (if is_output==false) or output + // matrix index for "node_name" is stored. This looks at the next command (at + // program_counter_) and in pending_commands_, and sees whether we were + // expecting any input or output for this node, and if there is a match, + // returns it and "consumes" the command by either advancing program_counter_ + // or consuming something from pending_commands_. + // If there is not a match (i.e. we were not expecting this type of I/O + // at this point in the computation), it prints an error and dies. + int32 GetIoMatrixIndex(const std::string &node_name, bool is_output); + + + // This function, called from Run(), checks that there is no pending I/O + // that we were waiting for, that would block the running of the + // computation; it crashes if there was pending input, and ignores and + // skips over any pending output. + void CheckNoPendingIo(); CuSubMatrix GetSubMatrix(int32 submatrix_index); @@ -144,11 +158,6 @@ class NnetComputer { int32 num_cols, CuArray *pointers); - // with check_output_deriv = false, checks we have all inputs. - // with check_output_deriv = true, checks we have all required output-derivs. - void CheckInputs(bool check_output_deriv) const; - - struct CommandDebugInfo { // Uncentered standard deviations of elements of all matrices that this // command writes. Dimension is the same as diff --git a/src/nnet3/nnet-derivative-test.cc b/src/nnet3/nnet-derivative-test.cc index a12ca2ae0af..1f8aa7dcfec 100644 --- a/src/nnet3/nnet-derivative-test.cc +++ b/src/nnet3/nnet-derivative-test.cc @@ -198,7 +198,7 @@ void UnitTestNnetModelDerivatives() { } KALDI_LOG << "Running forward computation"; - computer.Forward(); + computer.Run(); const CuMatrixBase &output(computer.GetOutput("output")); KALDI_LOG << "Output sum for pass " << pass << " is " << output.Sum(); @@ -208,9 +208,9 @@ void UnitTestNnetModelDerivatives() { if (pass == 0) { // we need to do the backward computation (to get the model derivative) CuMatrix temp(output_deriv); - computer.AcceptOutputDeriv("output", &temp); + computer.AcceptInput("output", &temp); KALDI_LOG << "Running backward computation"; - computer.Backward(); + computer.Run(); } else { // work out the predicted objf-change as dot-product of deriv and // parameter-change. The expression below can be interpreted as @@ -369,7 +369,7 @@ void UnitTestNnetInputDerivatives() { } KALDI_LOG << "Running forward computation"; - computer.Forward(); + computer.Run(); const CuMatrixBase &output(computer.GetOutput("output")); KALDI_LOG << "Output sum for pass " << pass << " is " << output.Sum(); @@ -379,11 +379,11 @@ void UnitTestNnetInputDerivatives() { if (pass == 0) { // We need to compute the input derivatives. 
CuMatrix temp(output_deriv); - computer.AcceptOutputDeriv("output", &temp); + computer.AcceptInput("output", &temp); KALDI_LOG << "Running backward computation"; - computer.Backward(); + computer.Run(); for (size_t i = 0; i < request.inputs.size(); i++) { - input_derivs[i] = computer.GetInputDeriv(request.inputs[i].name); + input_derivs[i] = computer.GetOutput(request.inputs[i].name); KALDI_LOG << "Input-deriv norm for '" << request.inputs[i].name << "' is " << input_derivs[i].FrobeniusNorm(); } diff --git a/src/nnet3/nnet-diagnostics.cc b/src/nnet3/nnet-diagnostics.cc index 7f7d485ffe0..e7adeffeb09 100644 --- a/src/nnet3/nnet-diagnostics.cc +++ b/src/nnet3/nnet-diagnostics.cc @@ -69,10 +69,10 @@ void NnetComputeProb::Compute(const NnetExample &eg) { nnet_, deriv_nnet_); // give the inputs to the computer object. computer.AcceptInputs(nnet_, eg.io); - computer.Forward(); + computer.Run(); this->ProcessOutputs(eg, &computer); if (config_.compute_deriv) - computer.Backward(); + computer.Run(); } void NnetComputeProb::ProcessOutputs(const NnetExample &eg, diff --git a/src/nnet3/nnet-discriminative-diagnostics.cc b/src/nnet3/nnet-discriminative-diagnostics.cc index 10f0811c12e..417a6fa05ac 100644 --- a/src/nnet3/nnet-discriminative-diagnostics.cc +++ b/src/nnet3/nnet-discriminative-diagnostics.cc @@ -73,7 +73,7 @@ void NnetDiscriminativeComputeObjf::Compute(const NnetDiscriminativeExample &eg) use_xent_derivative = false; ComputationRequest request; - GetDiscriminativeComputationRequest(nnet_, eg, + GetDiscriminativeComputationRequest(nnet_, eg, need_model_derivative, store_component_stats, use_xent_regularization, use_xent_derivative, @@ -83,10 +83,10 @@ void NnetDiscriminativeComputeObjf::Compute(const NnetDiscriminativeExample &eg) nnet_, deriv_nnet_); // give the inputs to the computer object. computer.AcceptInputs(nnet_, eg.inputs); - computer.Forward(); + computer.Run(); this->ProcessOutputs(eg, &computer); if (nnet_config_.compute_deriv) - computer.Backward(); + computer.Run(); } void NnetDiscriminativeComputeObjf::ProcessOutputs( @@ -104,7 +104,7 @@ void NnetDiscriminativeComputeObjf::ProcessOutputs( KALDI_ERR << "Network has no output named " << sup.name; const CuMatrixBase &nnet_output = computer->GetOutput(sup.name); - + bool use_xent = (discriminative_config_.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". CuMatrix nnet_output_deriv, xent_deriv; @@ -112,18 +112,18 @@ void NnetDiscriminativeComputeObjf::ProcessOutputs( if (nnet_config_.compute_deriv) nnet_output_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - + if (use_xent) xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); if (objf_info_.count(sup.name) == 0) - objf_info_.insert(std::make_pair(sup.name, + objf_info_.insert(std::make_pair(sup.name, discriminative::DiscriminativeObjectiveInfo(discriminative_config_))); discriminative::DiscriminativeObjectiveInfo *stats = &(objf_info_[sup.name]); - discriminative::ComputeDiscriminativeObjfAndDeriv(discriminative_config_, + discriminative::ComputeDiscriminativeObjfAndDeriv(discriminative_config_, tmodel_, log_priors_, sup.supervision, nnet_output, stats, @@ -132,11 +132,11 @@ void NnetDiscriminativeComputeObjf::ProcessOutputs( (use_xent ? 
&xent_deriv : NULL)); if (nnet_config_.compute_deriv) - computer->AcceptOutputDeriv(sup.name, &nnet_output_deriv); - + computer->AcceptInput(sup.name, &nnet_output_deriv); + if (use_xent) { if (objf_info_.count(xent_name) == 0) - objf_info_.insert(std::make_pair(xent_name, + objf_info_.insert(std::make_pair(xent_name, discriminative::DiscriminativeObjectiveInfo(discriminative_config_))); discriminative::DiscriminativeObjectiveInfo &xent_stats = objf_info_[xent_name]; @@ -149,7 +149,7 @@ void NnetDiscriminativeComputeObjf::ProcessOutputs( xent_stats.tot_t_weighted += stats->tot_t_weighted; xent_stats.tot_objf += xent_objf; } - + num_minibatches_processed_++; } } @@ -168,21 +168,21 @@ bool NnetDiscriminativeComputeObjf::PrintTotalStats() const { BaseFloat tot_weight = info.tot_t_weighted; BaseFloat tot_objective = info.TotalObjf( discriminative_config_.criterion); - + info.PrintAll(discriminative_config_.criterion); if (info.tot_l2_term == 0.0) { KALDI_LOG << "Overall " << discriminative_config_.criterion << " objective for '" << name << "' is " - << (tot_objective / tot_weight) + << (tot_objective / tot_weight) << " per frame, " << "over " << tot_weight << " frames."; } else { KALDI_LOG << "Overall " << discriminative_config_.criterion << " objective for '" << name << "' is " - << (tot_objective / tot_weight) + << (tot_objective / tot_weight) << " + " << (info.tot_l2_term / tot_weight) << " per frame, " << "over " << tot_weight << " frames."; diff --git a/src/nnet3/nnet-discriminative-training.cc b/src/nnet3/nnet-discriminative-training.cc index 865056f3569..15c91d5c23b 100644 --- a/src/nnet3/nnet-discriminative-training.cc +++ b/src/nnet3/nnet-discriminative-training.cc @@ -57,7 +57,7 @@ NnetDiscriminativeTrainer::NnetDiscriminativeTrainer( KALDI_WARN << "Could not open cached computation. " "Probably this is the first training iteration."; } - } + } log_priors_.ApplyLog(); } @@ -79,10 +79,10 @@ void NnetDiscriminativeTrainer::Train(const NnetDiscriminativeExample &eg) { (delta_nnet_ == NULL ? nnet_ : delta_nnet_)); // give the inputs to the computer object. computer.AcceptInputs(*nnet_, eg.inputs); - computer.Forward(); + computer.Run(); this->ProcessOutputs(eg, &computer); - computer.Backward(); + computer.Run(); if (delta_nnet_ != NULL) { BaseFloat scale = (1.0 - nnet_config.momentum); @@ -126,7 +126,7 @@ void NnetDiscriminativeTrainer::ProcessOutputs(const NnetDiscriminativeExample & CuMatrix nnet_output_deriv(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - + bool use_xent = (opts_.discriminative_config.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". CuMatrix xent_deriv; @@ -140,14 +140,14 @@ void NnetDiscriminativeTrainer::ProcessOutputs(const NnetDiscriminativeExample & objf_info_[sup.name].stats.Configure(opts_.discriminative_config); objf_info_[sup.name].stats.Reset(); } - - ComputeDiscriminativeObjfAndDeriv(opts_.discriminative_config, + + ComputeDiscriminativeObjfAndDeriv(opts_.discriminative_config, tmodel_, log_priors_, sup.supervision, nnet_output, - &stats, + &stats, &nnet_output_deriv, (use_xent ? &xent_deriv : NULL)); - + if (use_xent) { // this block computes the cross-entropy objective. 
const CuMatrixBase &xent_output = computer->GetOutput(xent_name); @@ -175,16 +175,16 @@ void NnetDiscriminativeTrainer::ProcessOutputs(const NnetDiscriminativeExample & xent_deriv.MulRowsVec(cu_deriv_weights); } - computer->AcceptOutputDeriv(sup.name, &nnet_output_deriv); + computer->AcceptInput(sup.name, &nnet_output_deriv); objf_info_[sup.name].UpdateStats(sup.name, opts_.discriminative_config.criterion, opts_.nnet_config.print_interval, num_minibatches_processed_++, stats); - + if (use_xent) { xent_deriv.Scale(opts_.discriminative_config.xent_regularize); - computer->AcceptOutputDeriv(xent_name, &xent_deriv); + computer->AcceptInput(xent_name, &xent_deriv); } } } @@ -251,11 +251,11 @@ bool DiscriminativeObjectiveFunctionInfo::PrintTotalStats(const std::string &nam NnetDiscriminativeTrainer::~NnetDiscriminativeTrainer() { delete delta_nnet_; - + if (opts_.nnet_config.write_cache != "") { Output ko(opts_.nnet_config.write_cache, opts_.nnet_config.binary_write_cache); compiler_.WriteCache(ko.Stream(), opts_.nnet_config.binary_write_cache); - } + } } diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index 8fa1ef87e36..4d61f6f9f4a 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -117,9 +117,9 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { computer_opt.AcceptInput(request.inputs[i].name, &temp2); } KALDI_LOG << "Running non-optimized forward computation"; - computer.Forward(); + computer.Run(); KALDI_LOG << "Running optimized forward computation"; - computer_opt.Forward(); + computer_opt.Run(); const CuMatrixBase &output(computer.GetOutput("output")); KALDI_LOG << "Output sum (not optimized) is " << output.Sum(); @@ -136,20 +136,20 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { CuMatrix output_deriv_opt(output_deriv); if (request.outputs[0].has_deriv) { - computer.AcceptOutputDeriv("output", &output_deriv); - computer_opt.AcceptOutputDeriv("output", &output_deriv_opt); + computer.AcceptInput("output", &output_deriv); + computer_opt.AcceptInput("output", &output_deriv_opt); } KALDI_LOG << "Running non-optimized backward computation"; - computer.Backward(); + computer.Run(); KALDI_LOG << "Running optimized backward computation"; - computer_opt.Backward(); + computer_opt.Run(); for (size_t i = 0; i < request.inputs.size(); i++) { if (request.inputs[i].has_deriv) { const CuMatrixBase &in_deriv = - computer.GetInputDeriv(request.inputs[i].name); + computer.GetOutput(request.inputs[i].name); const CuMatrixBase &in_deriv_opt = - computer_opt.GetInputDeriv(request.inputs[i].name); + computer_opt.GetOutput(request.inputs[i].name); KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name << "' (non-optimized) is " << in_deriv.Sum(); KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 75e5b34bfb7..c84cb82733f 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -33,8 +33,12 @@ void IdentifySubmatrixArgs(NnetComputation::Command *c, case kAllocMatrixZeroed: case kAllocMatrixUndefined: case kDeallocMatrix: + submatrix_args->push_back(&c->arg1); + break; case kAllocMatrixFromOther: case kAllocMatrixFromOtherZeroed: + submatrix_args->push_back(&c->arg1); + submatrix_args->push_back(&c->arg2); break; case kPropagate: submatrix_args->push_back(&c->arg3); @@ -64,6 +68,9 @@ void IdentifySubmatrixArgs(NnetComputation::Command *c, 
case kCopyToRowsMulti: submatrix_args->push_back(&c->arg1); break; + case kAcceptInput: case kProvideOutput: + submatrix_args->push_back(&c->arg1); + break; case kNoOperation: case kNoOperationMarker: break; @@ -87,40 +94,13 @@ void IdentifySubmatrixArgs(std::vector *commands, } -void IdentifyMatrixArgs(std::vector *commands, - std::vector *matrix_args) { - matrix_args->clear(); - std::vector::iterator iter = commands->begin(), - end = commands->end(); - std::vector this_matrix_args; - for (; iter != end; ++iter) { - IdentifyMatrixArgs(&(*iter), &this_matrix_args); - matrix_args->insert(matrix_args->end(), - this_matrix_args.begin(), - this_matrix_args.end()); - } -} - -void IdentifyMatrixArgsInComputation(bool include_in_submatrices, - NnetComputation *computation, +void IdentifyMatrixArgsInComputation(NnetComputation *computation, std::vector *matrix_args) { - IdentifyMatrixArgs(&(computation->commands), matrix_args); int32 num_submatrices = computation->submatrices.size(); - matrix_args->reserve(matrix_args->size() + - (include_in_submatrices ? - computation->submatrices.size() : 0) + - 2 * computation->input_output_info.size()); - if (include_in_submatrices) - for (int32 s = 1; s < num_submatrices; s++) - matrix_args->push_back(&(computation->submatrices[s].matrix_index)); - unordered_map >::iterator - iter = computation->input_output_info.begin(), - end = computation->input_output_info.end(); - for (; iter != end; ++iter) { - matrix_args->push_back(&(iter->second.first)); - matrix_args->push_back(&(iter->second.second)); - } + matrix_args->reserve(computation->submatrices.size()); + for (int32 s = 1; s < num_submatrices; s++) + matrix_args->push_back(&(computation->submatrices[s].matrix_index)); } @@ -167,25 +147,6 @@ void IdentifyIndexesArgs(std::vector *commands, -void IdentifyMatrixArgs(NnetComputation::Command *c, - std::vector *matrix_args) { - matrix_args->clear(); - switch (c->command_type) { - case kAllocMatrixZeroed: - case kAllocMatrixUndefined: - case kDeallocMatrix: - matrix_args->push_back(&c->arg1); - break; - case kAllocMatrixFromOther: - case kAllocMatrixFromOtherZeroed: - matrix_args->push_back(&c->arg1); - matrix_args->push_back(&c->arg2); - break; - default: - break; - } -} - // static int32 ComputationRenumberer::CreateRenumbering( const std::vector &used, @@ -276,22 +237,10 @@ void ComputationRenumberer::ComputeMatrixIsUsed() { matrix_is_used_.clear(); matrix_is_used_.resize(computation_->matrices.size(), false); matrix_is_used_[0] = true; - - std::vector matrix_args; - bool include_in_submatrices = false; - IdentifyMatrixArgsInComputation(include_in_submatrices, - computation_, &matrix_args); - std::vector::iterator iter = matrix_args.begin(), - end = matrix_args.end(); - for (; iter != end; ++iter) { - int32 matrix_index = **iter; - if (matrix_index > 0) - matrix_is_used_[matrix_index] = true; - } // We also need to take into account when matrices are used indirectly via // submatrices (which is actually the main way they are accessed). 
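  // (Note: commands refer to matrices only through submatrix arguments, e.g.
  // the allocation and deallocation commands take a submatrix index as arg1,
  // so matrix usage can be determined entirely from
  // submatrices[s].matrix_index, as the loop below does.)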
- int32 num_submatrices_orig = computation_->submatrices.size(); - for (int32 s = 1; s < num_submatrices_orig; s++) { + int32 num_submatrices = computation_->submatrices.size(); + for (int32 s = 1; s < num_submatrices; s++) { int32 matrix_index = computation_->submatrices[s].matrix_index; if (submatrix_is_used_[s]) matrix_is_used_[matrix_index] = true; @@ -355,20 +304,15 @@ void ComputationRenumberer::RenumberSubmatrices() { void ComputationRenumberer::RenumberMatrices() { std::vector matrix_args; - bool include_in_submatrices = true; - IdentifyMatrixArgsInComputation(include_in_submatrices, - computation_, &matrix_args); - std::vector::iterator iter = matrix_args.begin(), - end = matrix_args.end(); - for (; iter != end; ++iter) { - if (**iter > 0) { - int32 new_matrix_index = old_to_new_matrix_[**iter]; - // old_to_new_matrix_[s] for s > 0 is only <= 0 (actually, -1) for - // submatrices that are never accessed, and these should never appear - // in this list. - KALDI_ASSERT(new_matrix_index > 0); - **iter = new_matrix_index; - } + int32 num_submatrices = computation_->submatrices.size(); + for (int32 s = 1; s < num_submatrices; s++) { + int32 *matrix_index = &(computation_->submatrices[s].matrix_index); + // old_to_new_matrix_[s] for s > 0 is only <= 0 (actually, -1) for + // submatrices that are never accessed, and these should never appear + // in this list. (presumably because we renumber the submatrices first). + int32 new_matrix_index = old_to_new_matrix_[*matrix_index]; + KALDI_ASSERT(new_matrix_index > 0); + *matrix_index = new_matrix_index; } std::vector new_matrices; @@ -615,80 +559,6 @@ void RemoveNoOps(NnetComputation *computation) { computation->commands.resize(output_iter - computation->commands.begin()); } -/// Wherever matrix orig_matrix_index appears in the input of the network -/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. -/// Returns true if it did replace it. -bool ReplaceInInput( - const Nnet &nnet, - int32 orig_matrix_index, int32 new_matrix_index, - NnetComputation *computation) { - bool ans = false; - int32 num_matrices = computation->matrices.size(); - KALDI_ASSERT(orig_matrix_index > 0 && orig_matrix_index < num_matrices && - new_matrix_index > 0 && new_matrix_index < num_matrices); - unordered_map >::iterator - iter = computation->input_output_info.begin(), - end = computation->input_output_info.end(); - for (; iter != end; ++iter) { - int32 network_node = iter->first, - &value_matrix_index = iter->second.first, - &deriv_matrix_index = iter->second.second; - if (nnet.IsOutputNode(network_node)) { - // deriv_matrix_index would be an input to the computation. - if (deriv_matrix_index == orig_matrix_index) { - deriv_matrix_index = new_matrix_index; - ans = true; - } - } else { - // value_matrix_index would be an input to the computation. - if (value_matrix_index == orig_matrix_index) { - value_matrix_index = new_matrix_index; - ans = true; - } - } - } - return ans; -} - - -/// Wherever matrix orig_matrix_index appears in the output of the network -/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. -/// Returns true if it did replace it. 
-bool ReplaceInOutput( - const Nnet &nnet, int32 orig_matrix_index, int32 new_matrix_index, - NnetComputation *computation) { - bool ans = false; - int32 num_matrices = computation->matrices.size(); - KALDI_ASSERT(orig_matrix_index > 0 && orig_matrix_index < num_matrices && - new_matrix_index > 0 && new_matrix_index < num_matrices); - unordered_map >::iterator - iter = computation->input_output_info.begin(), - end = computation->input_output_info.end(); - for (; iter != end; ++iter) { - int32 network_node = iter->first, - &value_matrix_index = iter->second.first, - &deriv_matrix_index = iter->second.second; - if (nnet.IsOutputNode(network_node)) { - // value_matrix_index would be an output of the computation. - if (value_matrix_index == orig_matrix_index) { - value_matrix_index = new_matrix_index; - ans = true; - } - } else { - // deriv_matrix_index would be an output of the computation. - if (deriv_matrix_index == orig_matrix_index) { - // we'd only have derivatives for actual inputs. [note: we also allow - // users to provide inputs for component nodes, but these would not have - // derivatives.] - KALDI_ASSERT(nnet.IsInputNode(network_node)); - deriv_matrix_index = new_matrix_index; - ans = true; - } - } - } - return ans; -} - VariableMergingOptimizer::VariableMergingOptimizer( const NnetOptimizeOptions &config, @@ -747,10 +617,10 @@ bool VariableMergingOptimizer::MergeVariables() { if (s1 > 0 && s2 > 0) { std::pair p = MayBeMerged(command_index, s1, s2); if (p.first) { - DoLeftMerge(command_index, s1, s2); + DoMerge(command_index, s1, s2); merged = true; } else if (p.second) { - DoRightMerge(command_index, s1, s2); + DoMerge(command_index, s2, s1); merged = true; } } @@ -800,45 +670,33 @@ void VariableMergingOptimizer::MarkAsDirty(int32 s) { } } -void VariableMergingOptimizer::DoRightMerge(int32 command_index, - int32 s1, int32 s2) { - // Prevent further optimizations touching s1 or s2 (we can - // try again in a later round of optimization, with a new - // instance of this class). - MarkAsDirty(s1); - MarkAsDirty(s2); - - int32 m1 = computation_->submatrices[s1].matrix_index, - m2 = computation_->submatrices[s2].matrix_index; - KALDI_ASSERT(m1 != m2 && m1 > 0 && m2 > 0); - { // modify submatrices for submatrices of m1 to effectively be sub-matrices of - // s2 instead (they will refer to m2 as the matrix_index). - std::vector::const_iterator iter = matrix_to_submatrix_[m1].begin(), - end = matrix_to_submatrix_[m1].end(); +void VariableMergingOptimizer::DoMerge(int32 command_index, + int32 s_to_keep, + int32 s_to_discard) { + // Prevent further optimizations touching either submatrix (we can try again + // in a later round of optimization, with a new instance of this class). + MarkAsDirty(s_to_keep); + MarkAsDirty(s_to_discard); + + int32 m_to_keep = computation_->submatrices[s_to_keep].matrix_index, + m_to_discard = computation_->submatrices[s_to_discard].matrix_index; + KALDI_ASSERT(m_to_keep != m_to_discard && m_to_keep > 0 && m_to_discard > 0); + + { // modify submatrices of m_to_discard to effectively be sub-matrices of + // s_to_keep instead (they will refer to m_to_keep as the matrix_index). 
+ std::vector::const_iterator iter = + matrix_to_submatrix_[m_to_discard].begin(), + end = matrix_to_submatrix_[m_to_discard].end(); for (; iter != end; ++iter) { int32 submatrix_index = *iter; - KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index==m1); + KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index + == m_to_discard); computation_->submatrices[submatrix_index] = - GetSubMatrixOfSubMatrix(*computation_, submatrix_index, s2); + GetSubMatrixOfSubMatrix(*computation_, submatrix_index, + s_to_keep); } } - const std::vector &matrix_accesses = analyzer_.matrix_accesses; - // - If m1 was an input, replace it as an input with m2 - bool replaced = ReplaceInInput(nnet_, m1, m2, computation_); - KALDI_ASSERT(replaced == matrix_accesses[m1].is_input); - if (replaced) { // Remove the command that allocates m2. - int32 alloc_command = matrix_accesses[m2].allocate_command; - KALDI_ASSERT(alloc_command != -1); - computation_->commands[alloc_command].command_type = - kNoOperation; - } - // we keep matrix m2 (so m2 is m_to_keep, m1 is m_to_discard). - DoMergeCommon(command_index, m2, m1); -} -void VariableMergingOptimizer::DoMergeCommon(int32 command_index, - int32 m_to_keep, - int32 m_to_discard) { ComputationAnalysis analysis(*computation_, analyzer_); NnetComputation::Command &c = computation_->commands[command_index]; const std::vector &matrix_accesses = @@ -852,52 +710,59 @@ void VariableMergingOptimizer::DoMergeCommon(int32 command_index, c.arg2 = -1; } - // - If both m_to_keep and m_to_discard have commands that deallocate them, - // keep only the allocation command for m_to_keep, and make sure it's after - // the last access of m_to_discard (otherwise delete any deallocation - // command). + // We want to ensure that there is only one deallocation command. + // If neither matrix is an output, then there will be 2 deallocation + // commands and we keep the one for m_to_keep (which, if the sizes + // differ, will be the larger of the two, so it's the one whose + // submatrix index refers to the entirety of the matrix). + // If one of them is an output, then remove the deallocation command + // of whichever one is not an output. + // As a simplification to the logic above: if the 'discard' matrix + // has a deallocation command (i.e. if that matrix was not an output) + // then remove it; otherwise remove the deallocation command of + // the 'keep' matrix. + int32 dealloc_keep = matrix_accesses[m_to_keep].deallocate_command, dealloc_discard = matrix_accesses[m_to_discard].deallocate_command; - if (dealloc_keep != -1 && dealloc_discard != -1) { - KALDI_ASSERT(analysis.LastMatrixAccess(m_to_discard) < dealloc_keep); + if (dealloc_discard != -1) { computation_->commands[dealloc_discard].command_type = kNoOperation; } else { - if (dealloc_keep != -1) - computation_->commands[dealloc_keep].command_type = - kNoOperation; - if (dealloc_discard != -1) - computation_->commands[dealloc_discard].command_type = - kNoOperation; - } - - // - If both m_to_keep and m_to_discard have commands that allocate them, - // keep only the allocation command for m_to_keep and make sure it's - // before the first access of m_to_discard. - // (otherwise delete any allocation command). 
- int32 alloc_keep = matrix_accesses[m_to_keep].allocate_command, - alloc_discard = matrix_accesses[m_to_discard].allocate_command; - if (alloc_keep != -1 && alloc_discard != -1) { + KALDI_ASSERT(dealloc_keep != -1); + computation_->commands[dealloc_keep].command_type = kNoOperation; + } + + { + // - Both m_to_keep and m_to_discard will have commands that allocate + // them, as all matrices do (note, kAcceptInput counts as an allocation + // command). If one of them is kAcceptInput, then delete the other one. + // Otherwise delete the "discard" one. As a simplification of the logic + // of the previous sentence: if the "discard" allocate command is + // kAcceptInput then delete the "keep" allocate command, else delete + // the "discard" allocate command. + // Note: after we renumber the submatrices, they both refer to the + // same underlying matrix, but we need to refer to them using a + // submatrix that refers to the entire matrix. The one we keep will + // always refer to the entire matrix. (In the case where one of + // them is an input, both submatrices are guaranteed to refer to the + // entire matrix). + int32 alloc_keep = matrix_accesses[m_to_keep].allocate_command, + alloc_discard = matrix_accesses[m_to_discard].allocate_command; + + KALDI_ASSERT(alloc_keep != -1 && alloc_discard != -1); KALDI_ASSERT(analysis.FirstMatrixAccess(m_to_discard) > alloc_keep); + NnetComputation::Command &keep_alloc_command = computation_->commands[alloc_keep], &discard_alloc_command = computation_->commands[alloc_discard]; - discard_alloc_command.command_type = kNoOperation; - if (keep_alloc_command.command_type == kAllocMatrixUndefined) { - keep_alloc_command.command_type = kAllocMatrixZeroed; - } else if (keep_alloc_command.command_type == kAllocMatrixFromOther) { - keep_alloc_command.command_type = kAllocMatrixFromOtherZeroed; + if (discard_alloc_command.command_type == kAcceptInput) { + keep_alloc_command.command_type = kNoOperation; + } else { + discard_alloc_command.command_type = kNoOperation; } - } else { - if (alloc_keep != -1) - computation_->commands[alloc_keep].command_type = - kNoOperation; - if (alloc_discard != -1) - computation_->commands[alloc_discard].command_type = - kNoOperation; } // If the matrix to discard had stride_type == kStrideEqualNumCols, set the - // matrix to keep's stride_type to kStrideEqualNuMCols. + // matrix to keep's stride_type to kStrideEqualNumCols. if (computation_->matrices[m_to_discard].stride_type == kStrideEqualNumCols) { computation_->matrices[m_to_keep].stride_type = kStrideEqualNumCols; // ... and perform an additional check. @@ -908,43 +773,6 @@ void VariableMergingOptimizer::DoMergeCommon(int32 command_index, } } -void VariableMergingOptimizer::DoLeftMerge(int32 command_index, - int32 s1, int32 s2) { - // Prevent further optimizations touching s1 or s2 (we can - // try again in a later round of optimization, with a new - // instance of this class). - MarkAsDirty(s1); - MarkAsDirty(s2); - - int32 m1 = computation_->submatrices[s1].matrix_index, - m2 = computation_->submatrices[s2].matrix_index; - KALDI_ASSERT(m1 != m2 && m1 > 0 && m2 > 0); - { // modify submatrices for submatrices of m2 to effectively be sub-matrices of - // s1 instead (they will refer to m1 as the matrix_index). 
- std::vector::const_iterator iter = matrix_to_submatrix_[m2].begin(), - end = matrix_to_submatrix_[m2].end(); - for (; iter != end; ++iter) { - int32 submatrix_index = *iter; - KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index==m2); - computation_->submatrices[submatrix_index] = - GetSubMatrixOfSubMatrix(*computation_, submatrix_index, s1); - } - } - const std::vector &matrix_accesses = analyzer_.matrix_accesses; - // - If m2 was an output, replace it as an input with m1. - bool replaced = ReplaceInOutput(nnet_, m2, m1, computation_); - KALDI_ASSERT(replaced == matrix_accesses[m2].is_output); - if (replaced) { // Remove the command that deallocates m1. - int32 dealloc_command = matrix_accesses[m1].deallocate_command; - KALDI_ASSERT(dealloc_command != -1); - computation_->commands[dealloc_command].command_type = - kNoOperation; - } - // we keep matrix m1 (so m1 is m_to_keep, m2 is m_to_discard). - DoMergeCommon(command_index, m1, m2); -} - - std::pair VariableMergingOptimizer::MayBeMerged( @@ -1067,14 +895,14 @@ int32 ModelUpdateConsolidator::ConsolidateSubmatrices( int32 new_whole_submatrix = computation_->NewMatrix(num_rows, num_cols, stride_type); // Add a command at the very start, to initialize this new matrix. - int32 new_matrix_index = - computation_->submatrices[new_whole_submatrix].matrix_index; // we can later on optimize this zeroed initialization to an undefined // initialization. extra_commands_[0].push_back( - NnetComputation::Command(kAllocMatrixZeroed, new_matrix_index)); + NnetComputation::Command(kAllocMatrixZeroed, new_whole_submatrix)); final_deallocate_commands_.push_back( - NnetComputation::Command(kDeallocMatrix, new_matrix_index)); + NnetComputation::Command(kDeallocMatrix, new_whole_submatrix)); + int32 new_matrix_index = + computation_->submatrices[new_whole_submatrix].matrix_index; if (!computation_->matrix_debug_info.empty()) computation_->matrix_debug_info[new_matrix_index].Swap(&debug_info); @@ -1311,6 +1139,7 @@ void DerivativeTimeLimiter::ModifyCommand(NnetComputation::Command *command) { MapAddRowRangesCommand(command); break; } + case kAcceptInput: case kProvideOutput: case kNoOperation: case kNoOperationMarker: break; default: diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 641e31d96b3..b833a4cafe9 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -52,14 +52,11 @@ struct NnetOptimizeOptions; // Forward declaration. may be sub-matrices of larger matrices. Note: the following - - Define last-access(submatrix) as: - If matrix-of(submatrix) is an output, then num-commands, otherwise the + - Define last-access(submatrix) as the last command that accesses that submatrix for either read or write. [note: deallocation does not count as a read or write operation]. - - Define first-access(submatrix) as: - If matrix-of(submatrix) is an input, then -1, otherwise the first command - that is *not* an allocation command that accessed that submatrix for either - read or write. + - Define first-access(submatrix) as the first command not of type kAlloc* + that accessed that submatrix for either read or write. - Define last-write-access(submatrix) as the last command-index that accessed the submatrix in a write operation, or -1 if there is no such command (this could happen for inputs). @@ -99,41 +96,27 @@ struct NnetOptimizeOptions; // Forward declaration. 
Otherwise (cases (b) and (c), in-place propagate or backprop), we insist that: - first-access(s2) == C - last-access(s1) == C - Note: in either case, these conditions imply that s2 is not an input and s1 is - not an output. + Note: in either case, these conditions imply that m2/s2 is not an input and m1/s1 is + not an output. [i.e. s1 *may* be an input and s2 *may* be an output]. + + We can explain the procedure for both left-merge and right-merge in one, because + it's the same. Define s_to_keep and m_to_keep as s1 and m1 if we're left-merging + and s2 and m2 if we're right-merging, and s_to_discard and m_to_discard the opposite + way. + + The procedure to merge in general is as follows: - The sequence of things we have to do for a right-merge (in which we delete - s1,m1) is as follows: - All submatrices that reference m1, make them reference m2 instead. - [later we'll renumber so that there are no duplicates.] - - If m1 was an input, replace it as an input with m2 and remove the - command that allocated m2. - - If it was an assignment [case (a)], replace the assignment command with a - no-op. - - If both m1 and m2 have commands that allocate them, keep only the - allocation command for m2, and make sure that it zeroes the data (we can - later change to undefined if allowed) and that it's before the first - non-allocation access of m1. Otherwise remove any allocation commands - (the merged variable is an input). - - If both m1 and m2 have commands that deallocate them, keep only the - deallocation command for m2, and make sure that it's after the last - access of m1 (otherwise delete any deallocation command, because m2 must - be an output). [note: previously we kept the later of the 2 commands, - but this had the effect of making inaccurate the Analyzer info for - a matrix (m2) that might later be used. - - If m1 had stride_type == kStrideEqualNumCols, set m2's stride_type - to kStrideEqualNuMCols. - - - The sequence of things we have to do for a right-merge (in which we delete - s1,m1) is as follows: - - All submatrices that reference m2, make them reference m1 instead. - [later we'll renumber so that there are no duplicates.] - - If m2 was an output, replace it as an output with m1 and remove the - command that deallocated m1. - ... the last four bullet-points, regarding removing the assignment command, - and allocation and deallocation, and stride-type, are the same as for a - left-merge, except swap m1 and m2. + [later we'll renumber so that there are no duplicates.] This automatically + takes care of making the input and output and allocation/deallocation + commands refer to the right matrix, in most cases. + - We need to get rid of duplicate or unnecessary allocation commands: + If m_to_discard is an input then get rid of the allocation command for + m_to_keep; otherwise get rid of the allocation command of m_to_discard. + - We need to get rid of duplicate or unnecessary deallocation commands: + If m_to_discard is an output then get rid of the deallocation command + for m_to_keep; otherwise get rid of the deallocation command for + m_to_discard. At the end when we call RemoveOrphanMatrices(), the renumbering code will automatically detect that there are duplicate submatrices, and will merge @@ -173,20 +156,10 @@ class VariableMergingOptimizer { /// @param s2 [in] A submatrix-index s2 > 0 std::pair MayBeMerged(int32 command, int32 s1, int32 s2) const; - // performs the left merge. Search for left-merge in the comment - // above the class declaration for details. 
- void DoLeftMerge(int32 command_index, int32 s1, int32 s2); - - // performs the right merge. Search for right-merge in the comment - // above the class declaration for details. - void DoRightMerge(int32 command_index, int32 s1, int32 s2); - - // Performs the actions common to both left and right merges, regarding - // removing the assignment command, and allocation and deallocation (called - // from DoLeftMerge and DoRightMerge). The m_to_keep and m_to_discard - // are the matrix-indexes we will keep and discard respectively. - void DoMergeCommon(int32 command_index, int32 m_to_keep, - int32 m_to_discard); + // Merges to matrices, whether left merge or right merge. s_to_keep and + // s_to_discard are the submatrix-indexes we will keep and discard + // respectively (these are s1 and s2 in some order. + void DoMerge(int32 command_index, int32 s_to_keep, int32 m_to_discard); /// Marks the variables underlying submatrix 's' as dirty void MarkAsDirty(int32 s); @@ -612,21 +585,6 @@ void RenumberComputation(NnetComputation *computation); /// Removes commands of type kNoOperation in the computation. void RemoveNoOps(NnetComputation *computation); -/// Wherever matrix orig_matrix_index appears in the input of the network -/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. -/// Returns true if it did replace it. -bool ReplaceInInput( - const Nnet &nnet, int32 orig_matrix_index, int32 new_matrix_index, - NnetComputation *computation); - -/// A helper function used in some optimization functions. -/// Wherever matrix orig_matrix_index appears in the output of the network -/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. -/// Returns true if it did replace it. -bool ReplaceInOutput( - const Nnet &nnet, int32 orig_matrix_index, int32 new_matrix_index, - NnetComputation *computation); - /// This function outputs to "submatrix_args" the addresses of a subset of /// arguments arg1 through arg6 in "command", that correspond to the indexes of /// submatrices. This is useful in renumbering code. Note: some of the @@ -653,32 +611,6 @@ void IdentifySubmatrixArgsInComputation(NnetComputation *computation, std::vector *submatrix_args); -/// This function outputs to "matrix_args" the addresses of a subset of the -/// arguments arg1 through arg6 in "command", that correspond to the indexes of -/// matrices. This is useful in renumbering code. (Note: only a few types of -/// command use matrix indexes). -void IdentifyMatrixArgs(NnetComputation::Command *command, - std::vector *matrix_args); - -/// This function outputs to "matrix_args" the addresses of a subset of the -/// arguments arg1 through arg6 in commands in "commands", that correspond to -/// the indexes of matrices. This is useful in renumbering code. (Note: only a -/// few types of command use matrix indexes). -void IdentifyMatrixArgs(std::vector *command, - std::vector *matrix_args); - -/// This function outputs to "matrix_args" the addresses of indexes inside -/// 'computation' that correspond to matrices. These live inside -/// computation->commands and computation->input_output_info; and if -/// 'include_from_submatrices' is true, then the matrix-indexes present in -/// computation->submatrices[*].matrix_index will be included too. Zeros may be -/// present if there were optional arguments; we do include pointers to them, -/// but you can just ignore them. 
-void IdentifyMatrixArgsInComputation(bool include_from_submatrices, - NnetComputation *computation, - std::vector *matrix_args); - - /// Identifies in the vector of commands, arguments that correspond to indexes /// into the computation's indexes_multi array, and outputs a list of pointers /// to those arguments to 'indexes_multi_args'. Useful in renumbering code. diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 9d6ff739768..3c6d7e21bd9 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -193,9 +193,8 @@ void RemoveUnnecessaryZeroing(const Nnet &nnet, continue; // nothing to do. if (computation->commands[allocate_command].command_type != kAllocMatrixZeroed) { - KALDI_ASSERT(computation->commands[allocate_command].command_type == - kAllocMatrixUndefined); - continue; // already leaving it undefined, so nothing to do. + continue; // already leaving it undefined, or it's an input, so nothing + // to do. } std::vector variables_for_matrix; a.variables.AppendVariablesForMatrix(matrix_index, &variables_for_matrix); @@ -294,7 +293,8 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, if (command.command_type == kAllocMatrixZeroed || command.command_type == kAllocMatrixUndefined || command.command_type == kDeallocMatrix) { - int32 m = command.arg1, num_rows = computation->matrices[m].num_rows, + int32 s = command.arg1, m = computation->submatrices[s].matrix_index, + num_rows = computation->matrices[m].num_rows, num_cols = computation->matrices[m].num_cols, num_cols_mod = num_cols * ( computation->matrices[m].stride_type == kDefaultStride ? 1 : -1); @@ -474,6 +474,15 @@ void Optimize(const NnetOptimizeOptions &config, if (GetVerboseLevel() >= 4) CheckComputation(nnet, request, *computation, false); + + // The following is not configurable because it is necessary for + // the computation to run correctly (we do it after compilation too, + // but the operations may have been put out of order by + // other optimizations.) + ConsolidateIoOperations(nnet, computation); + + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, request, *computation, false); } // ComputationRequests are distinguished by the names and indexes @@ -645,6 +654,82 @@ const NnetComputation* CachingOptimizingCompiler::Compile( return computation; } +/// Split the computation up into segments bounded internally by kNoOperationMarker. +/// For each segment, a pair of command-indexes (start, end) is output to the vector +/// 'segments', so the commands in the segment (not including kNoOperationMarker) +/// are numbered from start ... end - 1. +static void SplitComputationIntoSegments( + const NnetComputation &computation, + std::vector > *segments) { + + int32 num_commands = computation.commands.size(); + segments->clear(); + int32 cur_start = 0; + for (int32 c = 0; c < num_commands; c++) { + if (computation.commands[c].command_type == kNoOperationMarker) { + segments->push_back(std::pair(cur_start, c)); + cur_start = c + 1; + } + } + segments->push_back(std::pair(cur_start, num_commands)); +} + + +void ConsolidateIoOperations(const Nnet &nnet, + NnetComputation *computation) { + // These segments, represented as (start-index, end-index), + // are segments of the computation separated by kNoOperationMarker. + std::vector > segments; + SplitComputationIntoSegments(*computation, &segments); + + int32 num_commands = computation->commands.size(); + std::vector reordered_commands(num_commands); + // put kNoOperationMarker between all segments in the reordered commands. 
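  // (Clarifying note on the per-segment reordering done below: kAcceptInput
  // commands and kProvideOutput commands are moved to the start of each
  // segment, except that kProvideOutput commands for input nodes, i.e. where
  // the network gives an input-derivative back to the user, are moved to the
  // end of the segment; all remaining commands keep their relative order in
  // between.)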
+ for (size_t s = 0; s + 1 < segments.size(); s++) + reordered_commands[segments[s].second].command_type = kNoOperationMarker; + + // for each segment we'll divide the commands up into those that must appear + // at the left (start) of the segment, those that must appear in the middle + // and those that must appear at the right (end). + std::vector left_commands, middle_commands, right_commands; + + for (size_t s = 0; s < segments.size(); s++) { + int32 segment_start = segments[s].first, + segment_end = segments[s].second; + left_commands.clear(); + middle_commands.clear(); + right_commands.clear(); + for (int32 c = segment_start; c < segment_end; c++) { + if (computation->commands[c].command_type == kProvideOutput && + nnet.IsInputNode(computation->commands[c].arg2)) { + right_commands.push_back(c); + } else if (computation->commands[c].command_type == kProvideOutput || + computation->commands[c].command_type == kAcceptInput) { + left_commands.push_back(c); + } else { + middle_commands.push_back(c); + } + } + std::vector::const_iterator iter = left_commands.begin(), + end = left_commands.end(); + int32 c = segment_start; + for (; iter != end; ++iter, ++c) + reordered_commands[c] = computation->commands[*iter]; + iter = middle_commands.begin(); + end = middle_commands.end(); + for (; iter != end; ++iter, ++c) + reordered_commands[c] = computation->commands[*iter]; + iter = right_commands.begin(); + end = right_commands.end(); + for (; iter != end; ++iter, ++c) + reordered_commands[c] = computation->commands[*iter]; + KALDI_ASSERT(c == segment_end); + } + computation->commands.swap(reordered_commands); +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 732f11e29ac..520fe3d34a9 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -298,6 +298,15 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, NnetComputation *computation); +/// This optimization puts the I/O operations (kAcceptInput and kProvideOutput +/// at the very beginning or end of segments of computation. Specifically: +/// first the computation is broken up into segments delimited by kNoOperationMarker. +/// Then, for each segment, all I/O operations are moved to the start of the segment, +/// *except for* kProvideOutput for inpu nodes (where the network provides an +/// input-deriv), which is moved to the end of the segment. +void ConsolidateIoOperations(const Nnet &nnet, + NnetComputation *computation); + } // namespace nnet3 diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index ef12f0c89d7..f5687ebbe1e 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -68,10 +68,10 @@ void NnetTrainer::Train(const NnetExample &eg) { *nnet_, delta_nnet_); // give the inputs to the computer object. 
computer.AcceptInputs(*nnet_, eg.io); - computer.Forward(); + computer.Run(); this->ProcessOutputs(eg, &computer); - computer.Backward(); + computer.Run(); UpdateParamsWithMaxChange(); } @@ -324,7 +324,7 @@ void ComputeObjectiveFunction(const GeneralMatrix &supervision, CuMatrix output_deriv(output.NumRows(), output.NumCols(), kUndefined); cu_post.CopyToMat(&output_deriv); - computer->AcceptOutputDeriv(output_name, &output_deriv); + computer->AcceptInput(output_name, &output_deriv); } break; } @@ -335,7 +335,7 @@ void ComputeObjectiveFunction(const GeneralMatrix &supervision, *tot_weight = cu_post.Sum(); *tot_objf = TraceMatMat(output, cu_post, kTrans); if (supply_deriv) - computer->AcceptOutputDeriv(output_name, &cu_post); + computer->AcceptInput(output_name, &cu_post); break; } case kCompressedMatrix: { @@ -346,7 +346,7 @@ void ComputeObjectiveFunction(const GeneralMatrix &supervision, *tot_weight = cu_post.Sum(); *tot_objf = TraceMatMat(output, cu_post, kTrans); if (supply_deriv) - computer->AcceptOutputDeriv(output_name, &cu_post); + computer->AcceptInput(output_name, &cu_post); break; } } @@ -362,7 +362,7 @@ void ComputeObjectiveFunction(const GeneralMatrix &supervision, *tot_weight = diff.NumRows(); *tot_objf = -0.5 * TraceMatMat(diff, diff, kTrans); if (supply_deriv) - computer->AcceptOutputDeriv(output_name, &diff); + computer->AcceptInput(output_name, &diff); break; } default: diff --git a/src/nnet3/online-nnet3-decodable-simple.cc b/src/nnet3/online-nnet3-decodable-simple.cc index c93394dfebd..010dc80991a 100644 --- a/src/nnet3/online-nnet3-decodable-simple.cc +++ b/src/nnet3/online-nnet3-decodable-simple.cc @@ -204,7 +204,7 @@ void DecodableNnet3SimpleOnline::DoNnetComputation( ivector_feats_cu.Row(0).CopyFromVec(ivector); computer.AcceptInput("ivector", &ivector_feats_cu); } - computer.Forward(); + computer.Run(); CuMatrix cu_output; computer.GetOutputDestructive("output", &cu_output); // subtract log-prior (divide by prior) From 871b39aaa574be28c6d9b4eb1830461fc3521d75 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 8 Oct 2016 17:14:16 -0400 Subject: [PATCH 007/213] Some bug fixes to previous commit (RE refactoring code in nnet3). --- src/nnet3/am-nnet-simple.h | 2 +- src/nnet3/nnet-computation.cc | 11 ++++- src/nnet3/nnet-computation.h | 2 +- src/nnet3/nnet-compute-test.cc | 3 +- src/nnet3/nnet-compute.cc | 2 +- src/nnet3/nnet-derivative-test.cc | 20 +++++---- src/nnet3/nnet-optimize-test.cc | 38 ++++++++--------- src/nnet3/nnet-optimize-utils.cc | 71 ++++++++++++++----------------- src/nnet3/nnet-optimize-utils.h | 14 +++--- 9 files changed, 84 insertions(+), 79 deletions(-) diff --git a/src/nnet3/am-nnet-simple.h b/src/nnet3/am-nnet-simple.h index 5178c2a054d..c3d8301aa5a 100644 --- a/src/nnet3/am-nnet-simple.h +++ b/src/nnet3/am-nnet-simple.h @@ -94,7 +94,7 @@ class AmNnetSimple { /// This function works out the left_context_ and right_context_ variables /// from the network (it's a rather complex calculation). You should call /// this if you have structurally changed the nnet without calling SetNnet(), - /// e.g. using non-const GetNnet(). void SetContext(); + /// e.g. using non-const GetNnet(). 
void SetContext(); private: diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index bcbb47e5fd8..30dbaa94256 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -1090,14 +1090,21 @@ NnetComputation& NnetComputation::operator = (const NnetComputation &other) { void NnetComputation::GetWholeSubmatrices( std::vector *whole_submatrices) const { - whole_submatrices->resize(matrices.size(), 0); - int32 num_submatrices = submatrices.size(); + int32 num_matrices = matrices.size(), + num_submatrices = submatrices.size(); + whole_submatrices->clear(); + whole_submatrices->resize(num_matrices, 0); for (int32 s = 1; s < num_submatrices; s++) { if (IsWholeMatrix(s)) { int32 m = submatrices[s].matrix_index; (*whole_submatrices)[m] = s; } } + for (int32 m = 1; m < num_matrices; m++) { + KALDI_ASSERT((*whole_submatrices)[m] != 0 && + "Matrix exists with no submatrix that is " + "the whole of it."); + } } diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 6097b059d23..ba0eaada1a0 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -389,7 +389,7 @@ struct NnetComputation { // This function outputs a vector, indexed by matrix index, that gives you for // each matrix, the index of a submatrix which refers to the whole of that - // matrix (or 0 if there is no such submatrix, which should not happen). + // matrix; it makes sure that each matrix has such a submatrix. void GetWholeSubmatrices(std::vector *whole_submatrices) const; diff --git a/src/nnet3/nnet-compute-test.cc b/src/nnet3/nnet-compute-test.cc index f69d4d3036a..afe7da86dc1 100644 --- a/src/nnet3/nnet-compute-test.cc +++ b/src/nnet3/nnet-compute-test.cc @@ -172,7 +172,7 @@ void UnitTestNnetCompute() { int main() { using namespace kaldi; using namespace kaldi::nnet3; - SetVerboseLevel(4); + // SetVerboseLevel(4); for (kaldi::int32 loop = 0; loop < 2; loop++) { @@ -190,4 +190,3 @@ int main() { return 0; } - diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index b497e34aac4..7171e6b0273 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -501,7 +501,7 @@ int32 NnetComputer::GetIoMatrixIndex(const std::string &node_name, bool is_outpu // if you get the following error it will likely be a bug in the calling code, // or possibly due to giving the wrong egs. KALDI_ERR << "Could not " - << (is_output ? "provide output " : " accept input ") + << (is_output ? "provide output " : "accept input ") << "for network node " << node_name << " (it is not expected at this point in the computation)"; return 0; // Suppress compiler warnings; this line will never be reached. diff --git a/src/nnet3/nnet-derivative-test.cc b/src/nnet3/nnet-derivative-test.cc index 1f8aa7dcfec..0f5f2f6d54a 100644 --- a/src/nnet3/nnet-derivative-test.cc +++ b/src/nnet3/nnet-derivative-test.cc @@ -314,13 +314,6 @@ void UnitTestNnetInputDerivatives() { compute_opts.debug = true; computation.ComputeCudaIndexes(); - // the only reason we might need to provide the &nnet parameter is if the - // StoreStats() operation had been requested. We made sure no model update - // is being performed. - NnetComputer computer(compute_opts, - computation, - nnet, - &nnet); int32 num_directions = 3; // must be >= 1. Best if it's >1, will reduce // the probability of random failures. @@ -349,8 +342,18 @@ void UnitTestNnetInputDerivatives() { // Other passes are with various differently-perturbed versions of // the features. 
for (int32 pass = 0; pass <= num_directions + 1; pass++) { + // the only reason we might need to provide the &nnet parameter is if the + // StoreStats() operation had been requested. We made sure no model update + // is being performed. + NnetComputer computer(compute_opts, + computation, + nnet, + &nnet); + + // provide the input to the computations. for (size_t i = 0; i < request.inputs.size(); i++) { + CuMatrix temp(inputs[i]); if (pass > 0 && pass <= num_directions) { // Perturb the input randomly. delta_inputs[i].Resize(inputs[i].NumRows(), inputs[i].NumCols()); @@ -425,7 +428,7 @@ void UnitTestNnetInputDerivatives() { int main() { using namespace kaldi; using namespace kaldi::nnet3; - //SetVerboseLevel(2); + // SetVerboseLevel(4); for (kaldi::int32 loop = 0; loop < 2; loop++) { @@ -444,4 +447,3 @@ int main() { return 0; } - diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index 4d61f6f9f4a..97662acc556 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -138,26 +138,26 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { if (request.outputs[0].has_deriv) { computer.AcceptInput("output", &output_deriv); computer_opt.AcceptInput("output", &output_deriv_opt); - } - KALDI_LOG << "Running non-optimized backward computation"; - computer.Run(); - KALDI_LOG << "Running optimized backward computation"; - computer_opt.Run(); - for (size_t i = 0; i < request.inputs.size(); i++) { - if (request.inputs[i].has_deriv) { - const CuMatrixBase &in_deriv = - computer.GetOutput(request.inputs[i].name); - const CuMatrixBase &in_deriv_opt = - computer_opt.GetOutput(request.inputs[i].name); - KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name - << "' (non-optimized) is " << in_deriv.Sum(); - KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name - << "' (optimized) is " << in_deriv_opt.Sum(); - if (!ApproxEqual(in_deriv, in_deriv_opt)) { - KALDI_WARN << "Non-optimized and optimized versions of the " - << "computation give different input-derivs."; - return false; + KALDI_LOG << "Running non-optimized backward computation"; + computer.Run(); + KALDI_LOG << "Running optimized backward computation"; + computer_opt.Run(); + for (size_t i = 0; i < request.inputs.size(); i++) { + if (request.inputs[i].has_deriv) { + const CuMatrixBase &in_deriv = + computer.GetOutput(request.inputs[i].name); + const CuMatrixBase &in_deriv_opt = + computer_opt.GetOutput(request.inputs[i].name); + KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name + << "' (non-optimized) is " << in_deriv.Sum(); + KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name + << "' (optimized) is " << in_deriv_opt.Sum(); + if (!ApproxEqual(in_deriv, in_deriv_opt)) { + KALDI_WARN << "Non-optimized and optimized versions of the " + << "computation give different input-derivs."; + return false; + } } } } diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index c84cb82733f..b2f171a0670 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -919,7 +919,7 @@ int32 ModelUpdateConsolidator::ConsolidateSubmatrices( // submatrix numbered 'new_submatrix' the contents of the submatrix numbered // 'submatrices[i]'. 
Note: we hope that a later pass of optimization // (VariableMergingOptimization) will remove this redundant copy by - // having the operation that created it right directly to the location + // having the operation that created it write directly to the location // we want it to be. NnetComputation::Command c(kMatrixCopy, new_submatrix, submatrices[i]); extra_commands_[commands[i]].push_back(c); @@ -1123,8 +1123,8 @@ void DerivativeTimeLimiter::ModifyCommand(NnetComputation::Command *command) { command->arg5 = mapped_output_deriv_submatrix; command->arg6 = mapped_input_deriv_submatrix; } - } break; + } case kMatrixCopy: case kMatrixAdd: MapSimpleMatrixCommand(command); break; @@ -1162,7 +1162,7 @@ void DerivativeTimeLimiter::MapSimpleMatrixCommand(NnetComputation::Command *c) c->command_type = kNoOperation; return; } - // left_prune1 is the nmber of rows pruned away on the left for submatrix1. + // left_prune1 is the number of rows pruned away on the left for submatrix1. int32 orig_num_rows = computation_->submatrices[submatrix1].num_rows, left_prune1, left_prune2, right_prune1, right_prune2; GetPruneValues(submatrix1, submatrix1_mapped, &left_prune1, &right_prune1); @@ -1184,7 +1184,7 @@ void DerivativeTimeLimiter::MapSimpleMatrixCommand(NnetComputation::Command *c) } else { int32 num_rows = orig_num_rows - left_prune - right_prune; // note: the call NewSubMatrix effectively gives us a sub-matrix of a - // subm-matrix. + // sub-matrix. c->arg1 = computation_->NewSubMatrix(submatrix1, left_prune, num_rows, 0, -1); c->arg2 = computation_->NewSubMatrix(submatrix2, @@ -1394,7 +1394,7 @@ void DerivativeTimeLimiter::LimitDerivTimes() { max_deriv_time_ == std::numeric_limits::max()) return; // nothing to do. - EnsureMatricesHaveEntireSubmatrices(); + computation_->GetWholeSubmatrices(&whole_submatrices_); ComputeMatrixPruneInfo(); ComputeSubmatrixMaps(); ModifyCommands(); @@ -1403,20 +1403,6 @@ void DerivativeTimeLimiter::LimitDerivTimes() { RenumberComputation(computation_); } -void DerivativeTimeLimiter::EnsureMatricesHaveEntireSubmatrices() { - int32 num_matrices = computation_->matrices.size(), - num_submatrices = computation_->submatrices.size(); - entire_submatrix_.clear(); - entire_submatrix_.resize(num_matrices, -1); - entire_submatrix_[0] = 0; - for (int32 s = 1; s < num_submatrices; s++) - if (computation_->IsWholeMatrix(s)) - entire_submatrix_[computation_->submatrices[s].matrix_index] = s; - for (int32 m = 1; m < num_matrices; m++) - if (entire_submatrix_[m] == -1) - entire_submatrix_[m] = computation_->NewSubMatrix(m, 0, -1, 0, -1); -} - void DerivativeTimeLimiter::ComputeMatrixPruneInfo() { KALDI_ASSERT(computation_->matrix_debug_info.size() == computation_->matrices.size() && @@ -1517,20 +1503,20 @@ void DerivativeTimeLimiter::ModifyCommands() { // desired range are never accessed), and false otherwise. bool DerivativeTimeLimiter::CanLimitMatrix(const Analyzer &analyzer, int32 m) const { - int32 s_entire = entire_submatrix_[m]; // submatrix consisting of + int32 s_whole = whole_submatrices_[m]; // submatrix consisting of // all of the matrix. - int32 s_mapped = submatrix_map_[s_entire]; // the matrix limited in time. - KALDI_ASSERT(s_mapped != 0 && s_mapped != s_entire); - std::vector entire_variables, mapped_variables; - analyzer.variables.AppendVariablesForSubmatrix(s_entire, - &entire_variables); + int32 s_mapped = submatrix_map_[s_whole]; // the matrix limited in time. 
+ KALDI_ASSERT(s_mapped != 0 && s_mapped != s_whole); + std::vector whole_variables, mapped_variables; + analyzer.variables.AppendVariablesForSubmatrix(s_whole, + &whole_variables); analyzer.variables.AppendVariablesForSubmatrix(s_mapped, &mapped_variables); - KALDI_ASSERT(entire_variables.size() > mapped_variables.size()); - std::vector excluded_variables(entire_variables.size() - + KALDI_ASSERT(whole_variables.size() > mapped_variables.size()); + std::vector excluded_variables(whole_variables.size() - mapped_variables.size()); std::vector::iterator end_iter = - std::set_difference(entire_variables.begin(), entire_variables.end(), + std::set_difference(whole_variables.begin(), whole_variables.end(), mapped_variables.begin(), mapped_variables.end(), excluded_variables.begin()); KALDI_ASSERT(end_iter == excluded_variables.end()); @@ -1579,15 +1565,24 @@ void DerivativeTimeLimiter::LimitMatrices(const std::vector &will_limit) { // rows to the left. submat_info.row_offset = new_row_begin; } else { - // This submatrix is not entirely the kept range of the matrix. - // We assume that this submatrix is never accessed directly (as when - // we modified the computation we ensured this). We - // give it a valid but stupid size of num-rows=1, num-cols=1, so - // that if it ever does get accessed it should produce an error. - submat_info.row_offset = 0; - submat_info.num_rows = 1; - submat_info.col_offset = 0; - submat_info.num_cols = 1; + // This submatrix is not entirely inside the kept range of the matrix. + // We assume that this submatrix is never accessed directly except (if + // it was the whole matrix) for in allocation and deallocation commands, + // since when we modified the computation we ensured this. + if (computation_->IsWholeMatrix(s)) { + // If it was the whole matrix then it may be used in allocation and + // deallocation commands, so we should modify it to be the whole of the + // new matrix, which will have fewer rows than before. + submat_info.num_rows = matrix_num_rows; + } else { + // We believe this matrix should never be used. We give it a valid + // but stupid size of num-rows=1, num-cols=1, so that if it ever does + // get accessed it should produce an error. + submat_info.row_offset = 0; + submat_info.num_rows = 1; + submat_info.col_offset = 0; + submat_info.num_cols = 1; + } } } } @@ -1614,7 +1609,7 @@ void DerivativeTimeLimiter::LimitMatrices(const std::vector &will_limit) { void DerivativeTimeLimiter::PruneMatrices() { Analyzer analyzer; analyzer.Init(nnet_, *computation_); - KALDI_ASSERT(computation_->matrices.size() == entire_submatrix_.size()); + KALDI_ASSERT(computation_->matrices.size() == whole_submatrices_.size()); int32 num_matrices = computation_->matrices.size(); std::vector will_limit(num_matrices, false); bool will_limit_at_least_one = false; diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index b833a4cafe9..9b3b640d817 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -362,6 +362,13 @@ class ComputationRenumberer { }; +// Class DerivativeTimeLimiter is used inside LimitDerivativeTimes(). +// Its function is to modify the computation so that we don't work +// with derivatives outside of a specified range of t values; this is +// useful, for instance, in BLSTMs where you might have a fair amount of +// left and right context in the training examples but don't want to +// propagate the derivatives to there. 
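// For example (an illustrative sketch with made-up numbers): if the training
// chunks contain frames t = -20 .. 40 but we only want derivatives to be
// propagated for t = 0 .. 20, the limiter would be set up with
// min_deriv_time = 0 and max_deriv_time = 20; commands that only touch
// derivative rows outside that range are turned into no-ops, and the
// corresponding rows of the derivative matrices are pruned away.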
+// // We require that the computation have debug info set up // (!matrix_debug_info.empty()) and that this be the first // optimization you perform. This means that the debug_info will @@ -378,11 +385,6 @@ class DerivativeTimeLimiter { private: - // This command ensures that for each matrix m there is a corresponding - // submatrix that spans the entire matrix, and stores its index in - // entire_submatrix_[m]. - void EnsureMatricesHaveEntireSubmatrices(); - // sets up matrix_prune_info_. void ComputeMatrixPruneInfo(); @@ -478,7 +480,7 @@ class DerivativeTimeLimiter { // for each matrix index > 0, the index of a submatrix that consists of // the entirety of that matrix. - std::vector entire_submatrix_; + std::vector whole_submatrices_; std::vector matrix_prune_info_; From 7efa27f61eca81cb67148fa9bee6bfc310c90285 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Mon, 10 Oct 2016 18:49:20 -0400 Subject: [PATCH 008/213] Further refactoring to nnet3 compilation to make it easier to implement online computation. --- src/nnet3/nnet-compile.cc | 217 +++++++++------ src/nnet3/nnet-compile.h | 37 ++- src/nnet3/nnet-computation-graph.cc | 412 +++++++++++++++++----------- src/nnet3/nnet-computation-graph.h | 97 ++++--- src/nnet3/nnet-optimize.cc | 11 +- src/nnet3/nnet-optimize.h | 9 +- src/nnet3/nnet-utils.cc | 4 +- 7 files changed, 486 insertions(+), 301 deletions(-) diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index ae3073b6265..8e70ecb4c4c 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -28,30 +28,66 @@ namespace nnet3 { Compiler::Compiler( const ComputationRequest &request, - const Nnet &nnet): request_(request), nnet_(nnet) { } + const Nnet &nnet): nnet_(nnet) { + requests_.push_back(&request); +} +Compiler::Compiler( + const std::vector &requests, + const Nnet &nnet): requests_(requests), nnet_(nnet) { + KALDI_ASSERT(requests_.size() >= 1); + // We are currently not supporting getting model derivatives for multi-segment + // (online) computations. + if (requests_.size() != 1) { + for (size_t i = 0; i < requests_.size(); i++) { + KALDI_ASSERT(!requests_[i]->need_model_derivative); + KALDI_ASSERT(requests_[i]->store_component_stats == + requests_[0]->store_component_stats); + } + } +} void Compiler::CreateComputation(const CompilerOptions &opts, NnetComputation *computation) { computation->Clear(); - ComputationGraphBuilder builder(nnet_, request_, &graph_); - builder.Compute(); - if (!builder.AllOutputsAreComputable()) { - builder.ExplainWhyAllOutputsNotComputable(); // prints logging info - KALDI_ERR << "Not all outputs were computable, cannot create computation."; + ComputationGraphBuilder builder(nnet_, &graph_); + for (size_t segment = 0; segment < requests_.size(); segment++) { + builder.Compute(*(requests_[segment])); + if (!builder.AllOutputsAreComputable()) { + builder.ExplainWhyAllOutputsNotComputable(); // prints logging info + KALDI_ERR << "Not all outputs were computable, cannot create computation."; + } + builder.Prune(); } - builder.Prune(); // see function declaration's comment for meaning of "phases". - std::vector > phases; - ComputeComputationPhases(nnet_, graph_, &phases); + std::vector > > phases_per_segment; + ComputeComputationPhases(nnet_, graph_, &phases_per_segment); std::vector > steps; - ComputeComputationSteps(nnet_, request_, phases, &graph_, &steps); - phases.clear(); + steps.reserve(1000); + + // maps each step to the segment in which it appears. in the normal case + // (non-online computation), a vector of all zeros. 
+ std::vector step_to_segment; + + for (size_t segment = 0; segment < requests_.size(); segment++) { + std::vector > this_segment_steps; + ComputeComputationSteps(nnet_, *(requests_[segment]), + phases_per_segment[segment], &graph_, + &this_segment_steps); + for (size_t i = 0; i < this_segment_steps.size(); i++) { + steps.push_back(std::vector()); + steps.back().swap(this_segment_steps[i]); + step_to_segment.push_back(segment); + } + } + // TODO (?) check that the total num_cindexes in the steps in >= + // graph->cindexes.size(). could do it inside CreateLocationInfo(). + phases_per_segment.clear(); CreateLocationInfo(steps); std::vector deriv_needed; - ComputeDerivNeeded(steps, &deriv_needed); - CreateStepInfo(deriv_needed, &steps, computation); - AddCommands(deriv_needed, computation); + ComputeDerivNeeded(steps, step_to_segment, &deriv_needed); + CreateStepInfo(deriv_needed, step_to_segment, &steps, computation); + AddCommands(deriv_needed, step_to_segment, computation); // the following command reorders commands so kAcceptInput and kProvideOutput // appear in the desired places. ConsolidateIoOperations(nnet_, computation); @@ -60,8 +96,9 @@ void Compiler::CreateComputation(const CompilerOptions &opts, } void Compiler::AddCommands(const std::vector &deriv_needed, + const std::vector &step_to_segment, NnetComputation *computation) { - computation->need_model_derivative = request_.need_model_derivative; + computation->need_model_derivative = requests_[0]->need_model_derivative; int32 arbitrary_factor = 8; computation->commands.reserve(computation->matrices.size() * arbitrary_factor); @@ -69,51 +106,27 @@ void Compiler::AddCommands(const std::vector &deriv_needed, std::vector whole_submatrices; computation->GetWholeSubmatrices(&whole_submatrices); AllocateMatrices(whole_submatrices, computation); - SetUpPrecomputedIndexes(computation); + SetUpPrecomputedIndexes(step_to_segment, computation); int32 num_steps = steps_.size(); - for (int32 step = 0; step < num_steps; step++) + for (int32 step = 0; step < num_steps; step++) { DoForwardComputation(step, computation); + if (step + 1 < static_cast(step_to_segment.size()) && + step_to_segment[step + 1] != step_to_segment[step]) { + // insert a marker that separates segments of the computation. + computation->commands.push_back( + NnetComputation::Command(kNoOperationMarker)); + } + } - AddCommandsAfterPropagate(deriv_needed, computation); - - for (int32 step = num_steps - 1; step >= 0; step--) - if (deriv_needed[step]) - DoBackwardComputation(step, computation); - DeallocateMatrices(whole_submatrices, computation); -} - -void Compiler::AddCommandsAfterPropagate(const std::vector &deriv_needed, - NnetComputation *computation) { // mark the end of the forward phase. computation->commands.push_back( NnetComputation::Command(kNoOperationMarker)); - std::vector deriv_input_commands; + for (int32 step = num_steps - 1; step >= 0; step--) + if (deriv_needed[step]) + DoBackwardComputation(step, computation); - // We handle output nodes here-- add commands that relate to us providing - // outputs to the user; then, if applicable, we add commands to direct us to - // accept derivatives w.r.t. those outputs from the user. 
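Concretely, the per-segment step lists are concatenated into one global list while step_to_segment records which segment each step came from; a kNoOperationMarker command is later emitted wherever that mapping changes. The following simplified, self-contained sketch shows just this bookkeeping; steps are plain int lists here rather than lists of cindex_ids.

// Sketch: concatenating per-segment step lists into one global list while
// filling a step_to_segment map, in the spirit of Compiler::CreateComputation().
#include <cstdio>
#include <vector>

int main() {
  // Two "segments", each with a few steps; each step is just a list of ints here.
  std::vector<std::vector<std::vector<int> > > steps_per_segment = {
    { {0, 1}, {2, 3, 4} },          // segment 0: 2 steps
    { {5}, {6, 7}, {8} }            // segment 1: 3 steps
  };

  std::vector<std::vector<int> > steps;
  std::vector<int> step_to_segment;
  for (size_t segment = 0; segment < steps_per_segment.size(); segment++) {
    for (size_t i = 0; i < steps_per_segment[segment].size(); i++) {
      steps.push_back(std::vector<int>());
      steps.back().swap(steps_per_segment[segment][i]);   // avoid copying
      step_to_segment.push_back(static_cast<int>(segment));
    }
  }
  // A segment boundary is wherever consecutive steps map to different segments;
  // the compiler inserts a marker command at each such point.
  for (size_t step = 0; step + 1 < steps.size(); step++)
    if (step_to_segment[step + 1] != step_to_segment[step])
      std::printf("segment boundary after step %zu\n", step);   // prints: after step 1
  return 0;
}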
- int32 num_steps = steps_.size(); - for (int32 step = 0; step < num_steps; step++) { - const StepInfo &step_info = steps_[step]; - if (nnet_.IsOutputNode(step_info.node_index)) { - int32 node_index = step_info.node_index, - submatrix_index = step_info.value; - KALDI_ASSERT(computation->IsWholeMatrix(submatrix_index)); - NnetComputation::Command c(kProvideOutput, submatrix_index, node_index); - computation->commands.push_back(c); - if (deriv_needed[step]) { - int32 deriv_submatrix_index = step_info.deriv; - KALDI_ASSERT(deriv_submatrix_index > 0); - KALDI_ASSERT(computation->IsWholeMatrix(deriv_submatrix_index)); - NnetComputation::Command c(kAcceptInput, deriv_submatrix_index, node_index); - deriv_input_commands.push_back(c); - } - } - } - computation->commands.insert(computation->commands.end(), - deriv_input_commands.begin(), - deriv_input_commands.end()); + DeallocateMatrices(whole_submatrices, step_to_segment, computation); } @@ -153,7 +166,11 @@ void Compiler::ComputeStepDependencies( void Compiler::ComputeDerivNeeded( const std::vector > &steps, + const std::vector &step_to_segment, std::vector *deriv_needed) { + KALDI_ASSERT(steps.size() == step_to_segment.size() && + step_to_segment[0] == 0 && + step_to_segment.back() + 1 == requests_.size()); deriv_needed->clear(); int32 num_steps = steps.size(); deriv_needed->resize(num_steps, false); @@ -181,33 +198,30 @@ void Compiler::ComputeDerivNeeded( } // if this step is an input and the user requested the derivative w.r.t. that // input, we need the derivative. + const ComputationRequest &request = *(requests_[step_to_segment[step]]); + if (is_input) { - int32 input_index = request_.IndexForInput(node_name); + int32 input_index = request.IndexForInput(node_name); KALDI_ASSERT(input_index != -1); - if (request_.inputs[input_index].has_deriv) + if (request.inputs[input_index].has_deriv) (*deriv_needed)[step] = true; } // if this step is an output and the user is providing the derivative w.r.t. that // output, we need a place to store the derivative, so we set (*deriv_needed) to // true. if (nnet_.IsOutputNode(node_index)) { - int32 output_index = request_.IndexForOutput(node_name); + int32 output_index = request.IndexForOutput(node_name); KALDI_ASSERT(output_index != -1); - if (request_.outputs[output_index].has_deriv) + if (request.outputs[output_index].has_deriv) (*deriv_needed)[step] = true; } - // If this is an updatable Component node with a nonzero learning rate and - // the user requested model derivatives (e.g. during training), we need this - // step's derivative. - if (nnet_.IsComponentNode(node_index) && request_.need_model_derivative) { + // If this is an updatable Component node and the user requested model + // derivatives (e.g. during training), we need this step's derivative. + if (nnet_.IsComponentNode(node_index) && request.need_model_derivative) { const NetworkNode &node = nnet_.GetNode(node_index); const Component *c = nnet_.GetComponent(node.u.component_index); - if (c->Properties() & kUpdatableComponent) { - const UpdatableComponent *u = dynamic_cast(c); - KALDI_ASSERT(u != NULL); - if (u->LearningRate() != 0) - (*deriv_needed)[step] = true; - } + if (c->Properties() & kUpdatableComponent) + (*deriv_needed)[step] = true; } } if (GetVerboseLevel() >= 5) { @@ -249,6 +263,7 @@ MatrixStrideType Compiler::GetStrideType(int32 node_index) const { // function destroys it. 
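The deriv_needed decision itself is a small local predicate over a few flags. The sketch below is a simplification of the logic in ComputeDerivNeeded(); the struct and function names are illustrative only, and the real code obtains the flags from the graph and from the per-segment ComputationRequest.

// Sketch of the per-step deriv_needed decision.  StepKind/DerivNeeded are
// hypothetical names used only for illustration.
#include <vector>

struct StepKind {
  bool is_input;                // step corresponds to an input node
  bool input_has_deriv;         // user asked for the deriv w.r.t. this input
  bool is_output;               // step corresponds to an output node
  bool output_has_deriv;        // user supplies a deriv w.r.t. this output
  bool is_updatable_component;  // component node with trainable parameters
};

// Returns true if this step needs a derivative matrix / a backprop pass.
bool DerivNeeded(const StepKind &step,
                 const std::vector<bool> &dep_deriv_needed,  // deriv_needed of the steps we depend on
                 bool need_model_derivative) {
  for (size_t i = 0; i < dep_deriv_needed.size(); i++)
    if (dep_deriv_needed[i]) return true;   // must pass derivs back to a dependency
  if (step.is_input && step.input_has_deriv) return true;
  if (step.is_output && step.output_has_deriv) return true;
  if (step.is_updatable_component && need_model_derivative) return true;
  return false;
}

int main() {
  StepKind output_step = { false, false, true, true, false };
  std::vector<bool> deps(2, false);
  // An output the user wants derivatives w.r.t. needs a deriv matrix even
  // when model derivatives are not requested.
  return DerivNeeded(output_step, deps, false) ? 0 : 1;
}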
void Compiler::CreateStepInfo( const std::vector &deriv_needed, + const std::vector &step_to_segment, std::vector > *by_step, NnetComputation *computation) { KALDI_ASSERT(!by_step->empty()); @@ -257,6 +272,7 @@ void Compiler::CreateStepInfo( for (int32 step = 0; step < num_steps; step++) { StepInfo &this_info = steps_[step]; this_info.output_cindex_ids.swap((*by_step)[step]); + this_info.segment = step_to_segment[step]; int32 num_ids = this_info.output_cindex_ids.size(); this_info.output_indexes.resize(num_ids); for (int32 row_index = 0; row_index < num_ids; row_index++) @@ -341,11 +357,14 @@ void Compiler::CreateStepInfo( void Compiler::CreateLocationInfo( const std::vector > &by_step) { cindex_id_to_location_.clear(); - int32 num_cindex_ids = graph_.cindexes.size(); + int32 num_cindex_ids = graph_.cindexes.size(), + total_cindex_ids = 0; cindex_id_to_location_.resize(num_cindex_ids, std::pair(-1,-1)); int32 num_steps = by_step.size(); for (int32 step = 0; step < num_steps; step++) { + // output_cindex_ids is the cindex_ids that this step produces. const std::vector &output_cindex_ids = by_step[step]; + total_cindex_ids += output_cindex_ids.size(); int32 num_rows = output_cindex_ids.size(); for (int32 row = 0; row < num_rows; row++) { int32 cindex_id = output_cindex_ids[row]; @@ -359,6 +378,11 @@ void Compiler::CreateLocationInfo( cindex_id_to_location_[cindex_id] = std::pair(step, row); } } + // All cindex_ids in the graph must be present in a step, which is why + // we make the following assert. In general this will be with equality, + // but I believe there might be some weird edge cases, maybe involving + // kDimRange nodes, that would make this not true. [not 100% sure.] + KALDI_ASSERT(total_cindex_ids >= num_cindex_ids); } void Compiler::DoForwardComputation(int32 step, @@ -388,6 +412,17 @@ void Compiler::DoForwardComputationDescriptor( int32 num_parts = steps_[step].value_parts.size(); for (int32 part = 0; part < num_parts; part++) DoForwardComputationSumDescriptor(step, part, computation); + const StepInfo &step_info = steps_[step]; + if (nnet_.IsOutputNode(step_info.node_index)) { + // If the node is an output then we need to add commands to provide the + // output to the user, and possibly to get derivatives w.r.t. the output + // from the user. + int32 node_index = step_info.node_index, + submatrix_index = step_info.value; + KALDI_ASSERT(computation->IsWholeMatrix(submatrix_index)); + NnetComputation::Command c(kProvideOutput, submatrix_index, node_index); + computation->commands.push_back(c); + } } @@ -634,16 +669,22 @@ void Compiler::DoBackwardComputationFromSubmatLocations( // trickier to implement efficiently on the GPU, there may be cases // which we will refuse to implement backprop for if we get here. 
- - - int32 first_value; - std::vector second_values; - if (ConvertToIndexes(submat_locations, &first_value, - &second_values)) { - int32 input_deriv_submatrix_index = first_value; + int32 num_rows = submat_locations.size(); + std::vector >::const_iterator + iter = submat_locations.begin(), end = submat_locations.end(); + int32 first_submat = iter->first; + for (++iter; iter != end; ++iter) + if (iter->first != first_submat) + break; + bool all_same_submatrix = (iter == end); + if (all_same_submatrix) { + int32 input_deriv_submatrix_index = first_submat; + std::vector indexes(num_rows); + for (int32 i = 0; i < num_rows; i++) + indexes[i] = submat_locations[i].second; DoBackwardComputationFromIndexes(deriv_submatrix_index, input_deriv_submatrix_index, - second_values, + indexes, computation); return; } else { @@ -761,6 +802,15 @@ void Compiler::DoBackwardComputationFromIndexes( void Compiler::DoBackwardComputationDescriptor( int32 step, NnetComputation *computation) { StepInfo &step_info = steps_[step]; + if (nnet_.IsOutputNode(step_info.node_index) && + step_info.deriv > 0) { + int32 deriv_submatrix_index = step_info.deriv; + KALDI_ASSERT(computation->IsWholeMatrix(deriv_submatrix_index)); + NnetComputation::Command c(kAcceptInput, deriv_submatrix_index, + step_info.node_index); + computation->commands.push_back(c); + } + // the top-level descriptor has a bunch of parts that we concatenate features // over. int32 num_parts = step_info.value_parts.size(); @@ -833,7 +883,7 @@ void Compiler::AddForwardStepComponent(int32 step, output_submatrix_index); computation->commands.push_back(c); - if (request_.store_component_stats) { + if (requests_[0]->store_component_stats) { const Component *c = nnet_.GetComponent(node.u.component_index); if (c->Properties() & kStoresStats) { NnetComputation::Command c(kStoreStats, @@ -948,6 +998,7 @@ void Compiler::AllocateMatrices(const std::vector &whole_submatrices, void Compiler::SetUpPrecomputedIndexes( + const std::vector &step_to_segment, NnetComputation *computation) { int32 num_steps = steps_.size(); KALDI_ASSERT(computation->component_precomputed_indexes.empty()); @@ -968,9 +1019,10 @@ void Compiler::SetUpPrecomputedIndexes( const Component *component = nnet_.GetComponent(component_index); - bool need_derivs = request_.NeedDerivatives(); + const ComputationRequest &request = *(requests_[step_to_segment[step]]); + bool need_derivs = request.NeedDerivatives(); ComponentPrecomputedIndexes *precomputed_indexes = - component->PrecomputeIndexes(request_.misc_info, + component->PrecomputeIndexes(request.misc_info, input_indexes, output_indexes, need_derivs); if (precomputed_indexes == NULL) { @@ -985,8 +1037,8 @@ void Compiler::SetUpPrecomputedIndexes( } } - void Compiler::DeallocateMatrices(const std::vector &whole_submatrices, + const std::vector &step_to_segment, NnetComputation *computation) { // This adds the commands to destroy all the matrices- but not the // ones that might be needed as outputs of the computation. The ones that @@ -999,6 +1051,7 @@ void Compiler::DeallocateMatrices(const std::vector &whole_submatrices, int32 num_steps = steps_.size(); for (int32 step = 0; step < num_steps; step++) { const StepInfo &step_info = steps_[step]; + const ComputationRequest &request = *(requests_[step_to_segment[step]]); if (nnet_.IsOutputNode(step_info.node_index)) { // steps corresponding to output nodes need to have their "value" kept. 
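The test in DoBackwardComputationFromSubmatLocations() above, which selects the cheaper indexed-row backprop path, simply asks whether every (submatrix, row) location refers to the same submatrix and, if so, strips the pairs down to a plain vector of row indexes. A standalone sketch of that check (the helper name is illustrative):

// Sketch of the "all locations in one submatrix?" test.
#include <cassert>
#include <utility>
#include <vector>

// Returns true and fills 'indexes' if every pair has the same .first.
bool AllSameSubmatrix(const std::vector<std::pair<int, int> > &submat_locations,
                      int *submat_index, std::vector<int> *indexes) {
  if (submat_locations.empty()) return false;
  int first_submat = submat_locations[0].first;
  for (size_t i = 1; i < submat_locations.size(); i++)
    if (submat_locations[i].first != first_submat) return false;
  *submat_index = first_submat;
  indexes->resize(submat_locations.size());
  for (size_t i = 0; i < submat_locations.size(); i++)
    (*indexes)[i] = submat_locations[i].second;
  return true;
}

int main() {
  std::vector<std::pair<int, int> > locs = { {7, 0}, {7, 3}, {7, 1} };
  int submat;
  std::vector<int> rows;
  assert(AllSameSubmatrix(locs, &submat, &rows));
  assert(submat == 7 && rows[1] == 3);
  return 0;
}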
int32 value_matrix_index = @@ -1010,11 +1063,11 @@ void Compiler::DeallocateMatrices(const std::vector &whole_submatrices, // need to worry about whether outputs were requested, because if they // were not requested we would not be computing them in the first place). std::string input_name = nnet_.GetNodeNames()[step_info.node_index]; - int32 i = 0, num_inputs = request_.inputs.size(); + int32 i = 0, num_inputs = request.inputs.size(); bool has_deriv = false; for (; i < num_inputs; i++) { - if (input_name == request_.inputs[i].name) { - has_deriv = request_.inputs[i].has_deriv; + if (input_name == request.inputs[i].name) { + has_deriv = request.inputs[i].has_deriv; break; } } diff --git a/src/nnet3/nnet-compile.h b/src/nnet3/nnet-compile.h index 195ac36006a..8b9e738d251 100644 --- a/src/nnet3/nnet-compile.h +++ b/src/nnet3/nnet-compile.h @@ -43,14 +43,23 @@ struct CompilerOptions { /// nnet-optimize.h. class Compiler { public: + // Constructor that takes one computation request (this is the normal case). Compiler(const ComputationRequest &request, const Nnet &nnet); + // Constructor with a sequence of computation requests, for multiple + // computation segments (used when creating online computations). + Compiler(const std::vector &request, + const Nnet &nnet); + void CreateComputation(const CompilerOptions &opts, NnetComputation *computation); private: - const ComputationRequest &request_; + // requests_ is the sequence of computation requests, one for each segment; it + // will contain just one element in the normal case, but more when we're + // compiling a multi-segment / 'online' computation. + std::vector requests_; const Nnet &nnet_; ComputationGraph graph_; @@ -65,6 +74,11 @@ class Compiler { // if not used (note: index zero is reserved for the empty // matrix). + int32 segment; // normally 0 except for online/multi-segment computations, + // identifies the segment of which this step is a part (each + // segment in the sequence has a different + // ComputationRequest). + // precomputed_indexes_index is the index into the // component_precomputed_indexes array in the NnetComputation, or zero if // none needed. @@ -92,7 +106,7 @@ class Compiler { // backprop. std::vector > > > input_locations_list; - StepInfo(): node_index(-1), value(0), deriv(0), + StepInfo(): node_index(-1), value(0), deriv(0), segment(0), precomputed_indexes_index(0) { } }; @@ -112,12 +126,19 @@ class Compiler { // whether, for that step, we need to allocate the matrix of derivatives // (interpret this as being at the output of that step). This variable // also tells us whether we need to execute the backprop code for that step. + // 'steps' is a vector of steps; each step is a list of cindexes. + // 'step_to_segment', which should have the same dimension as 'steps', + // maps from step index to the segment it occurs in (only interesting + // for multi-segment/online computations). + // 'deriv_needed' will be given the same length as 'steps'. void ComputeDerivNeeded(const std::vector > &steps, + const std::vector &step_to_segment, std::vector *deriv_needed); // this sets up steps_, destroying the input "by_step" in the process. It // also sets various matrix and sub-matrix sizes in "computation". void CreateStepInfo(const std::vector &deriv_needed, + const std::vector &step_to_segment, std::vector > *by_step, NnetComputation *computation); @@ -155,7 +176,8 @@ class Compiler { // Sets up the precomputed indexes for each component, and sets the // precomputed_indexes_index value for each step. 
- void SetUpPrecomputedIndexes(NnetComputation *computation); + void SetUpPrecomputedIndexes(const std::vector &step_to_segment, + NnetComputation *computation); // Adds to "computation" the command(s) for the forward computation // for this step. @@ -294,19 +316,14 @@ class Compiler { // 'whole_submatrices' is as created by computation->GetWholeSubmatrices(), it // gives us the index of a submatrix containing the whole of each matrix. void DeallocateMatrices(const std::vector &whole_submatrices, + const std::vector &step_to_segment, NnetComputation *computation); // sets up the debug_info member of "computation". void OutputDebugInfo(NnetComputation *computation) const; - - // this function, called from AddCommands, adds the output and input - // commands that happen after the forward pass and before the backward - // pass. - void AddCommandsAfterPropagate(const std::vector &deriv_needed, - NnetComputation *computation); - void AddCommands(const std::vector &deriv_needed, + const std::vector &step_to_segment, NnetComputation *computation); }; diff --git a/src/nnet3/nnet-computation-graph.cc b/src/nnet3/nnet-computation-graph.cc index 43427fb39e4..cf43ca9f804 100644 --- a/src/nnet3/nnet-computation-graph.cc +++ b/src/nnet3/nnet-computation-graph.cc @@ -54,52 +54,72 @@ int32 ComputationGraph::GetCindexId(const Cindex &cindex) const { } -void ComputationGraph::Renumber(const std::vector &keep) { - int32 num_cindex_ids = cindexes.size(); - KALDI_ASSERT(keep.size() == num_cindex_ids); - ComputationGraph temp_graph; - std::vector old2new(num_cindex_ids, -1), new2old; - new2old.reserve(num_cindex_ids); - for (int32 j = 0; j < num_cindex_ids; j++) { +void ComputationGraph::Renumber(int32 start_cindex_id, + const std::vector &keep) { + int32 old_num_cindex_ids = cindexes.size(); + KALDI_ASSERT(keep.size() == old_num_cindex_ids - start_cindex_id); + // count_before_renumbering is the number of cindex_ids >= start_cindex_id, + // before renumbering. + int32 count_before_renumbering = old_num_cindex_ids - start_cindex_id; + std::vector old2new(count_before_renumbering, -1), new2old; + new2old.reserve(old_num_cindex_ids); + for (int32 j = 0; j < count_before_renumbering; j++) { if (keep[j]) { - old2new[j] = new2old.size(); - new2old.push_back(j); + old2new[j] = new2old.size() + start_cindex_id; + new2old.push_back(j + start_cindex_id); } } - int32 new_num_cindex_ids = new2old.size(); - if (new_num_cindex_ids == num_cindex_ids) { + // count_after_renumbering is the number of cindex_ids >= start_cindex_id, + // after renumbering. + int32 count_after_renumbering = new2old.size(), + new_num_cindex_ids = start_cindex_id + count_after_renumbering; + if (count_after_renumbering == count_before_renumbering) { // this is an optimization for when we are not deleting any // cindex-ids. 
return; } - temp_graph.cindexes.resize(new_num_cindex_ids); - temp_graph.is_input.resize(new_num_cindex_ids); - temp_graph.dependencies.resize(new_num_cindex_ids); - for (int32 c = 0; c < new_num_cindex_ids; c++) { - int32 d = new2old[c]; - temp_graph.cindexes[c] = cindexes[d]; - temp_graph.is_input[c] = is_input[d]; - temp_graph.dependencies[c].reserve(dependencies[d].size()); + + for (int32 old_cindex_id = start_cindex_id; + old_cindex_id < old_num_cindex_ids; old_cindex_id++) { + int32 new_cindex_id = old2new[old_cindex_id - start_cindex_id]; + Cindex &cindex = cindexes[old_cindex_id]; + if (new_cindex_id == -1) { + cindex_to_cindex_id_.erase(cindex); + } else if (new_cindex_id != old_cindex_id) { + cindex_to_cindex_id_[cindex] = new_cindex_id; + } + } + + std::vector temp; + for (int32 c = start_cindex_id; c < new_num_cindex_ids; c++) { + int32 d = new2old[c - start_cindex_id]; + // note: d >= c, which is why we do not overwrite data here. + KALDI_PARANOID_ASSERT(d >= c); + cindexes[c] = cindexes[d]; + is_input[c] = is_input[d]; + // if c == d, we need to create a temporary copy. + const std::vector &src_dependencies = + (c == d ? (temp = dependencies[d]) : dependencies[d]); std::vector::const_iterator - iter = dependencies[d].begin(), end = dependencies[d].end(); + iter = src_dependencies.begin(), end = src_dependencies.end(); + dependencies[c].clear(); for (; iter != end; ++iter) { - int32 old_dep = *iter, new_dep = old2new[old_dep]; - if (new_dep != -1) - temp_graph.dependencies[c].push_back(new_dep); - else - KALDI_ERR << "Dependency on nonexistent cindex-id"; + int32 old_dep = *iter; + if (old_dep < start_cindex_id) { + dependencies[c].push_back(old_dep); + } else { + int32 new_dep = old2new[old_dep - start_cindex_id]; + if (new_dep != -1) + dependencies[c].push_back(new_dep); + else + KALDI_ERR << "Dependency on nonexistent cindex-id"; + } } } - // at this point, rather than setting up cindex_to_cindex_id_ on the temporary - // graph, we copy cindexes, is_input and dependencies to this graph, and then - // set up cindex_to_cindex_id_ locally. 
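The in-place renumbering of the tail of the cindex arrays relies on the source index never being smaller than the destination index, so a single forward pass can compact the data without a temporary graph. The standalone sketch below shows the same idea on a plain int vector; RenumberTail is an illustrative name.

// Sketch: renumbering/compacting only the tail of an array, keeping entries
// for which keep[] is true, in the spirit of ComputationGraph::Renumber().
#include <cassert>
#include <vector>

void RenumberTail(int start, const std::vector<bool> &keep,
                  std::vector<int> *data, std::vector<int> *old2new) {
  int old_size = data->size();
  assert(static_cast<int>(keep.size()) == old_size - start);
  old2new->assign(old_size - start, -1);
  int new_pos = start;
  for (int old_pos = start; old_pos < old_size; old_pos++) {
    if (keep[old_pos - start]) {
      (*old2new)[old_pos - start] = new_pos;
      // old_pos >= new_pos, so this forward copy never clobbers unread data.
      (*data)[new_pos++] = (*data)[old_pos];
    }
  }
  data->resize(new_pos);
}

int main() {
  std::vector<int> data = { 10, 11, 12, 13, 14, 15 };      // entries 0,1 untouched
  std::vector<bool> keep = { true, false, true, false };   // applies to entries 2..5
  std::vector<int> old2new;
  RenumberTail(2, keep, &data, &old2new);
  assert(data.size() == 4 && data[2] == 12 && data[3] == 14);
  assert(old2new[0] == 2 && old2new[1] == -1 && old2new[2] == 3);
  return 0;
}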
- cindexes.swap(temp_graph.cindexes); - is_input.swap(temp_graph.is_input); - dependencies.swap(temp_graph.dependencies); - cindex_to_cindex_id_.clear(); - for (int32 c = 0; c < new_num_cindex_ids; c++) - cindex_to_cindex_id_[cindexes[c]] = c; + cindexes.resize(new_num_cindex_ids); + is_input.resize(new_num_cindex_ids); + dependencies.resize(new_num_cindex_ids); } void ComputationGraphBuilder::PrintCindexId(std::ostream &os, @@ -229,17 +249,17 @@ void ComputationGraphBuilder::AddCindexId(int32 cindex_id, void ComputationGraphBuilder::AddInputs() { int32 num_added = 0; - for (int32 i = 0; i < request_.inputs.size(); i++) { - int32 n = nnet_.GetNodeIndex(request_.inputs[i].name); + for (int32 i = 0; i < request_->inputs.size(); i++) { + int32 n = nnet_.GetNodeIndex(request_->inputs[i].name); if (n == -1) KALDI_ERR << "Network has no input with name " - << request_.inputs[i].name; + << request_->inputs[i].name; NodeType t = nnet_.GetNode(n).node_type; KALDI_ASSERT((t == kInput || t == kComponent) && "Inputs to graph only allowed for Input and Component nodes."); - for (int32 j = 0; j < request_.inputs[i].indexes.size(); j++) { - Cindex cindex(n, request_.inputs[i].indexes[j]); + for (int32 j = 0; j < request_->inputs[i].indexes.size(); j++) { + Cindex cindex(n, request_->inputs[i].indexes[j]); bool is_input = true, is_new; int32 cindex_id = graph_->GetCindexId(cindex, is_input, &is_new); KALDI_ASSERT(is_new && "Input index seems to be listed more than once"); @@ -252,13 +272,13 @@ void ComputationGraphBuilder::AddInputs() { void ComputationGraphBuilder::AddOutputs() { int32 num_added = 0; - for (int32 i = 0; i < request_.outputs.size(); i++) { - int32 n = nnet_.GetNodeIndex(request_.outputs[i].name); + for (int32 i = 0; i < request_->outputs.size(); i++) { + int32 n = nnet_.GetNodeIndex(request_->outputs[i].name); if (n == -1) KALDI_ERR << "Network has no output with name " - << request_.outputs[i].name; - for (int32 j = 0; j < request_.outputs[i].indexes.size(); j++) { - Cindex cindex(n, request_.outputs[i].indexes[j]); + << request_->outputs[i].name; + for (int32 j = 0; j < request_->outputs[i].indexes.size(); j++) { + Cindex cindex(n, request_->outputs[i].indexes[j]); bool is_input = false, is_new; int32 cindex_id = graph_->GetCindexId(cindex, is_input, &is_new); KALDI_ASSERT(is_new && "Output index seems to be listed more than once"); @@ -328,7 +348,7 @@ void ComputationGraphBuilder::ExplainWhyAllOutputsNotComputable() const { KALDI_LOG << num_not_computable << " output cindexes out of " << num_outputs_total << " were not computable."; std::ostringstream os; - request_.Print(os); + request_->Print(os); KALDI_LOG << "Computation request was: " << os.str(); if (num_not_computable > num_print) KALDI_LOG << "Printing the reasons for " << num_print << " of these."; @@ -392,7 +412,7 @@ void ComputationGraphBuilder::PruneDependencies(int32 cindex_id) { // in the set of inputs to the component that are computable. 
IndexSet index_set(*graph_, computable_info_, node_id - 1, dont_care); std::vector used_indexes; - bool ans = c->IsComputable(request_.misc_info, index, index_set, + bool ans = c->IsComputable(request_->misc_info, index, index_set, &used_indexes); // If the next assert fails it could be a failure in the assumption that // making more inputs available will never change something from not being @@ -429,8 +449,23 @@ void ComputationGraphBuilder::PruneDependencies(int32 cindex_id) { dependencies.swap(used_cindex_ids); } -void ComputationGraphBuilder::Compute() { - KALDI_ASSERT(current_distance_ == -1 && "Compute() called twice?"); +ComputationGraphBuilder::ComputationGraphBuilder( + const Nnet &nnet, + ComputationGraph *graph): + nnet_(nnet), request_(NULL), graph_(graph), + current_distance_(-1) { + KALDI_ASSERT(graph_->cindexes.empty() && + "ComputationGraphBuilder initialized with nonempty graph."); +} + + +void ComputationGraphBuilder::Compute(const ComputationRequest &request) { + if (request_ != NULL && graph_->segment_ends.empty()) { + // this check is relevant to multi-segment (i.e. online) computations. + KALDI_ERR << "You are calling things in the wrong order: should be " + << "Compute(), Prune(), Compute, Prune(), ..."; + } + request_ = &request; AddInputs(); AddOutputs(); // sets current_distance_ to 0. // max_distance for debugging, to detect infinite recursion. @@ -449,7 +484,9 @@ void ComputationGraphBuilder::Compute() { if (current_distance_ == max_distance) KALDI_ERR << "Loop detected while building computation graph (bad " << "network topology?)"; - Check(); + + if (RandInt(1, 2 * (graph_->segment_ends.size() + 1)) == 1) + Check(); } @@ -531,30 +568,52 @@ void ComputationGraphBuilder::Check() const { } void ComputationGraphBuilder::Prune() { + // Since Prune() is called for each segment in turn [note: there + // will be only 1 segment in the normal non-online case], we + // only prune for the current, just-added segment. + int32 start_cindex_id = (graph_->segment_ends.empty() ? 0 : + graph_->segment_ends.back()); int32 num_cindex_ids = graph_->cindexes.size(); // Prune the dependencies to just those that are used (to remove // optional dependencies that don't end up getting used). - for (int32 cindex_id = 0; cindex_id < num_cindex_ids; cindex_id++) + for (int32 cindex_id = start_cindex_id; + cindex_id < num_cindex_ids; cindex_id++) PruneDependencies(cindex_id); - depend_on_this_.clear(); // not valid any more after pruning dependencies. + // the following clears the elements of depend_on_this from start_cindex_id to + // num_cindex_ids - 1, without touching the entire array. + depend_on_this_.resize(start_cindex_id); + depend_on_this_.resize(num_cindex_ids); std::vector required; - ComputeRequiredArray(&required); + ComputeRequiredArray(start_cindex_id, &required); - std::vector keep(num_cindex_ids, false); - for (int32 c = 0; c < num_cindex_ids; c++) { - if (required[c] || graph_->is_input[c]) { + std::vector keep(num_cindex_ids - start_cindex_id, false); + for (int32 c = start_cindex_id; c < num_cindex_ids; c++) { + if (required[c - start_cindex_id] || graph_->is_input[c]) { KALDI_ASSERT(computable_info_[c] == kComputable && "You are calling Prune when not everything is computable."); - keep[c] = true; + keep[c - start_cindex_id] = true; } } - graph_->Renumber(keep); - // The following variables will not be valid any more after the renumbering, - // so clear them. 
- computable_info_.clear(); - computable_queue_.clear(); - usable_count_.clear(); + graph_->Renumber(start_cindex_id, keep); + // We also need to renumber computable_info_ and usable_count_, which + // graph_->Renumber doesn't do for us, but we can make some shortcuts. We set + // all computable_info_ to kComputable because actually it all was kComputable + // (we checked when deciding what to keep); and we set the usable_count_ to 1 + // for all the cindex_ids we just added... this is not 100% accurate + // according to the way we defined usable_count_, but it prevents additional + // computation since it is > 0 (notice that IncrementUsableCount and + // DecrementUsableCount may do some work when the usable_count goes to zero or + // from zero. Anyway, the usable-count for these cindex_ids for those "older + // segments" is not critical. [this information only gets used if we process + // additional segments as part of the compilation of an online computation.] + int32 new_num_cindex_ids = graph_->cindexes.size(); + computable_info_.resize(start_cindex_id); + computable_info_.resize(new_num_cindex_ids, (char)kComputable); + usable_count_.resize(start_cindex_id); + usable_count_.resize(new_num_cindex_ids, 1); + KALDI_ASSERT(computable_queue_.empty()); + graph_->segment_ends.push_back(new_num_cindex_ids); } // Add cindex_ids that this cindex_id depends on. @@ -584,7 +643,7 @@ void ComputationGraphBuilder::AddDependencies(int32 cindex_id) { int32 c = node.u.component_index; const Component *component = nnet_.GetComponent(c); std::vector input_indexes; - component->GetInputIndexes(request_.misc_info, index, + component->GetInputIndexes(request_->misc_info, index, &input_indexes); input_cindexes.resize(input_indexes.size()); for (size_t i = 0; i < input_indexes.size(); i++) { @@ -690,14 +749,14 @@ ComputationGraphBuilder::ComputeComputableInfo(int32 cindex_id) const int32 input_node_id = node_id - 1; { IndexSet index_set(*graph_, computable_info_, input_node_id, false); - if (c->IsComputable(request_.misc_info, index, index_set, NULL)) { + if (c->IsComputable(request_->misc_info, index, index_set, NULL)) { // it's computable even without counting kUnknown inputs as computable // [treat_unknown_as_computable = false] -> definitely computable. return kComputable; } } IndexSet index_set2(*graph_, computable_info_, input_node_id, true); - if (!c->IsComputable(request_.misc_info, index, index_set2, NULL)) { + if (!c->IsComputable(request_->misc_info, index, index_set2, NULL)) { // it's not computable even when counting kUnknown inputs as computable // [treat_unknown_as_computable = true] -> definitely not computable. 
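One detail of the per-segment Prune() above worth spelling out is how the per-cindex arrays are reset for just the newest segment: the vector is resized down to start_cindex_id (leaving older segments untouched) and then resized back up with a fill value, and the new boundary is recorded in segment_ends. A minimal illustration of that idiom, with made-up sizes:

// Sketch of the "resize down, then resize up with a fill value" idiom used in
// Prune() to reset only the entries belonging to the newest segment.
#include <cassert>
#include <vector>

int main() {
  std::vector<char> computable_info = { 1, 1, 1, 0, 2, 0 };   // 6 cindex_ids before renumbering
  int start_cindex_id = 3, new_num_cindex_ids = 5;            // 2 cindex_ids survive in the new segment

  computable_info.resize(start_cindex_id);              // keep entries 0..2 as they were
  computable_info.resize(new_num_cindex_ids, (char)1);  // entries 3..4 all become 1 ("computable")

  assert(computable_info.size() == 5);
  assert(computable_info[2] == 1 && computable_info[3] == 1 && computable_info[4] == 1);

  std::vector<int> segment_ends = { 3 };                // first segment ended at cindex_id 3
  segment_ends.push_back(new_num_cindex_ids);           // the new segment ends at 5
  assert(segment_ends.back() == 5);
  return 0;
}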
return kNotComputable; @@ -731,9 +790,9 @@ void ComputationGraphBuilder::GetComputableInfo( KALDI_ASSERT(!computable_info_.empty() && "You need to call this before Prune()!"); computable->clear(); - computable->resize(request_.outputs.size()); - for (size_t i = 0; i < request_.outputs.size(); i++) { - const IoSpecification &output = request_.outputs[i]; + computable->resize(request_->outputs.size()); + for (size_t i = 0; i < request_->outputs.size(); i++) { + const IoSpecification &output = request_->outputs[i]; int32 n = nnet_.GetNodeIndex(output.name); KALDI_ASSERT(n != -1); int32 size = output.indexes.size(); @@ -861,19 +920,26 @@ void ComputationGraphBuilder::BuildGraphOneIter() { } void ComputationGraphBuilder::ComputeRequiredArray( + int32 start_cindex_id, std::vector *required) const { int32 num_cindex_ids = graph_->cindexes.size(); + KALDI_ASSERT(num_cindex_ids >= start_cindex_id); KALDI_ASSERT(computable_info_.size() == num_cindex_ids); required->clear(); - required->resize(num_cindex_ids, false); + required->resize(num_cindex_ids - start_cindex_id, false); + + // would be bool, but indexing c++ bool may be slow. + std::vector is_output_node(nnet_.NumNodes()); + for (int32 n = 0; n < nnet_.NumNodes(); n++) + is_output_node[n] = (char)(nnet_.IsOutputNode(n) ? 1 : 0); std::vector queue; - for (int32 c = 0; c < num_cindex_ids; c++) { + for (int32 c = start_cindex_id; c < num_cindex_ids; c++) { // First put the output cindex_ids into the queue. int32 node_id = graph_->cindexes[c].first; - if (nnet_.IsOutputNode(node_id)) { - (*required)[c] = true; + if (is_output_node[node_id]) { + (*required)[c - start_cindex_id] = true; queue.push_back(c); } } @@ -885,16 +951,17 @@ void ComputationGraphBuilder::ComputeRequiredArray( end = dependencies.end(); for (; iter != end; ++iter) { int32 d = *iter; - if (!(*required)[d]){ - (*required)[d] = true; + if (!(*required)[d - start_cindex_id]){ + (*required)[d - start_cindex_id] = true; queue.push_back(d); } } } // just check that we don't have any cindex_ids which are required but have // usable_count_ == 0; this would indicate a bug somewhere. - for (int32 c = 0; c < num_cindex_ids; c++) - KALDI_ASSERT(!((*required)[c] && (usable_count_[c] == 0))); + for (int32 c = start_cindex_id; c < num_cindex_ids; c++) + KALDI_ASSERT(!((*required)[c - start_cindex_id] && + (usable_count_[c] == 0))); } @@ -956,27 +1023,27 @@ void AddInputToGraph(const ComputationRequest &request, /** This function outputs to dependencies_subset[c], for each cindex_id c, the subset of elements d of graph.dependencies[c] such that - cindex_id_to_epoch[d] == cindex_id_to_epoch[c]. That is, it's + cindex_id_to_segment_and_epoch[d] == cindex_id_to_segment_and_epoch[c]. That is, it's the dependency graph of the entire computation, but removing - links that go from one epoch to another epoch. Topologically, - 'dependencies_subset' would therefor consist of a bunch of + links that go from one segment/epoch to another segment/epoch. Topologically, + 'dependencies_subset' would therefore consist of a bunch of disconnected graphs. 
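ComputeRequiredArray() above is a straightforward reverse-reachability pass: seed a queue with the segment's output cindexes, then repeatedly mark the dependencies of anything already marked. A toy, self-contained version of that pass:

// Sketch of the "required" computation: reachability from the outputs through
// the (already pruned) dependency lists.
#include <cassert>
#include <vector>

int main() {
  // dependencies[c] lists the cindex_ids that c depends on.
  std::vector<std::vector<int> > dependencies = {
    {},        // 0: an input
    {},        // 1: an input, unused
    {0},       // 2: depends on 0
    {2},       // 3: an output, depends on 2
    {1}        // 4: depends on 1, but nothing reaches it
  };
  std::vector<bool> is_output = { false, false, false, true, false };

  std::vector<bool> required(dependencies.size(), false);
  std::vector<int> queue;
  for (size_t c = 0; c < dependencies.size(); c++)
    if (is_output[c]) { required[c] = true; queue.push_back(c); }

  while (!queue.empty()) {
    int c = queue.back();
    queue.pop_back();
    for (size_t i = 0; i < dependencies[c].size(); i++) {
      int d = dependencies[c][i];
      if (!required[d]) { required[d] = true; queue.push_back(d); }
    }
  }
  // 3 -> 2 -> 0 are required; 1 and 4 are not.
  assert(required[0] && required[2] && required[3]);
  assert(!required[1] && !required[4]);
  return 0;
}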
*/ static void ComputeDependenciesSubset( const ComputationGraph &graph, - const std::vector &cindex_id_to_epoch, + const std::vector &cindex_id_to_segment_and_epoch, std::vector > *dependencies_subset) { int32 num_cindex_ids = graph.cindexes.size(); - KALDI_ASSERT(cindex_id_to_epoch.size() == num_cindex_ids); + KALDI_ASSERT(cindex_id_to_segment_and_epoch.size() == num_cindex_ids); dependencies_subset->resize(num_cindex_ids); for (int32 cindex_id = 0; cindex_id < num_cindex_ids; cindex_id++) { - int32 phase_index = cindex_id_to_epoch[cindex_id]; + int32 phase_index = cindex_id_to_segment_and_epoch[cindex_id]; const std::vector &dependencies = graph.dependencies[cindex_id]; std::vector &dep_subset = (*dependencies_subset)[cindex_id]; int32 num_dep = dependencies.size(); for (int32 i = 0; i < num_dep; i++) { int32 d = dependencies[i]; - if (cindex_id_to_epoch[d] == phase_index) + if (cindex_id_to_segment_and_epoch[d] == phase_index) dep_subset.push_back(d); } } @@ -1000,27 +1067,27 @@ static void ComputeDependenciesSubset( /// /// \param nnet [in] The neural net /// \param graph [in] The computation graph -/// \param cindex_id_to_epoch [out] A vector that maps cindex_id to -/// epoch index, as obtained by adding one to the output of -/// ComputeNnetComputationOrder; however, input cindex_ids (those for -/// which is_input[cindex_id] is true) always map to 0. -/// Note: the epoch-index only depends on the neural network's -/// topology of nodes; a node in the network should always map to -/// the same epoch-index regardless of the computation, and -/// we assign cindexes to epochs just based on what node the -/// cindexes are part of. -/// \param epochs [out] The same information as -/// cindex_id_to_epoch, but in a different format: for each -/// epoch, a list of cindex_ids with that epoch index. -/// \param epoch_is_trivial [out] A vector of bool, indexed by -/// epoch index that's true if this epoch index corresponds -/// to just a single NetworkNode. (and also true for epoch index 0, -/// which corresponds only to inputs to the network). +/// \param cindex_id_to_segment_and_epoch [out] A vector that maps cindex_id to +/// a number that is the same if two cindex_ids are in the same +/// segment and same epoch, and different otherwise. This +/// number combines the segment index and the epoch index; the +/// details are not important to the calling code. +/// \param epochs_per_segment [out] This is a listing of all the +/// cindex_ids in the computation graph, divided up first +/// by segment and then by epoch. +/// \param epoch_is_trivial [out] A vector of bool, indexed by the epoch +/// index which is the same as the second index of +/// 'epochs_per_segment', that's true if this epoch index corresponds +/// to just a single NetworkNode (and also true for epoch indexes +/// corresponding to inputs to the network, which will be the first +/// epoch of each segment). This depends on the neural network +/// structure only. 
+ static void ComputeEpochInfo( const Nnet &nnet, const ComputationGraph &graph, - std::vector *cindex_id_to_epoch, - std::vector > *epochs, + std::vector *cindex_id_to_segment_and_epoch, + std::vector > > *epochs_per_segment, std::vector *epoch_is_trivial) { // node_to_epoch maps each nnet node to an index >= 0 that tells us coarsely @@ -1041,10 +1108,14 @@ static void ComputeEpochInfo( node_to_epoch[i]++; int32 num_nodes = nnet.NumNodes(), num_cindex_ids = graph.cindexes.size(), + num_segments = graph.segment_ends.size(), num_epoch_indexes = 1 + *std::max_element(node_to_epoch.begin(), node_to_epoch.end()); KALDI_ASSERT(node_to_epoch.size() == num_nodes); + epochs_per_segment->clear(); + epochs_per_segment->resize(num_segments); + // epoch_to_num_nodes is only used so we know whether each epoch // index corresponds to multiple nodes; if it's just one node then we know // the computation is very simple and we can do an optimization. @@ -1057,15 +1128,24 @@ static void ComputeEpochInfo( KALDI_ASSERT(o == 0 || epoch_to_num_nodes[o] > 0); (*epoch_is_trivial)[o] = (epoch_to_num_nodes[o] <= 1); } - - cindex_id_to_epoch->resize(num_cindex_ids); - epochs->resize(num_epoch_indexes); - for (int32 cindex_id = 0; cindex_id < num_cindex_ids; cindex_id++) { - int32 node_index = graph.cindexes[cindex_id].first, - epoch_index = (graph.is_input[cindex_id] ? 0 : - node_to_epoch[node_index]); - (*cindex_id_to_epoch)[cindex_id] = epoch_index; - (*epochs)[epoch_index].push_back(cindex_id); + cindex_id_to_segment_and_epoch->resize(num_cindex_ids); + KALDI_ASSERT(graph.segment_ends.back() == num_cindex_ids); + int32 cur_segment_start = 0, cur_segment_end; + for (int32 segment = 0; segment < num_segments; segment++) { + cur_segment_end = graph.segment_ends[segment]; + std::vector > &epochs = (*epochs_per_segment)[segment]; + epochs.resize(num_epoch_indexes); + + for (int32 cindex_id = cur_segment_start; + cindex_id < cur_segment_end; cindex_id++) { + int32 node_index = graph.cindexes[cindex_id].first, + epoch_index = (graph.is_input[cindex_id] ? 0 : + node_to_epoch[node_index]); + (*cindex_id_to_segment_and_epoch)[cindex_id] = + epoch_index + segment * num_epoch_indexes; + epochs[epoch_index].push_back(cindex_id); + } + cur_segment_start = cur_segment_end; } } @@ -1168,6 +1248,14 @@ static int32 SumVectorSizes(const std::vector > &vec) { return ans; } +static int32 SumVectorSizes(const std::vector > > &vec) { + int32 ans = 0; + for (size_t i = 0; i < vec.size(); i++) + ans += SumVectorSizes(vec[i]); + return ans; +} + + /* this function is called from ComputeComputationPhases; it handles the part of the computation from one epoch (this code was broken out to avoid that @@ -1187,10 +1275,11 @@ static int32 SumVectorSizes(const std::vector > &vec) { in things like TDNNs. @param [in] dependencies_subset A subset of 'graph.dependencies' corresponding just to dependencies within the same epoch (not specifically - this epoch; for all epochs). E.g. for a cindex_id c + this epoch; for all epochs). In general, for a cindex_id c dependencies[c] is a list of other cindex_ids d1, d2, such that in order to compute c we must first compute - d1, d2 and so on. + d1, d2 and so on (plus d1, d2, etc. must be from the + same epoch as c). 
@param [in] depends_on_subset The graph-transpose of dependencies_subset; for cindex_id c, depends_on_subset[c] is the list of cindex_ids that directly depend on cindex_id c, @@ -1198,26 +1287,26 @@ static int32 SumVectorSizes(const std::vector > &vec) { @param [in] epoch_is_trivial A bool that's true if this epoch is trivial (meaning it consists of just one component)... this enables a faster code path in this common case. - @param [in,out] phase_indexes This vector, to some elements of which this function writes - each time it is called, maps from cindex_id to the - 'phase index'. A phase index is a number identifying - the phases [like coarse steps] of the computation, with - zero for the first phase, one for the second, etc. - We work out how many phase indexes have been used already - by previous epochs, from phases->size(). Actually, - phase_indexes is really just a temporary variable used - by this function, that we allocate outside this - function for efficiency. It is initialized to - -1 outside this function; different invocations of - this function work with different elements of the - vector. - @param [in,out] phases This is the output of this function. Each time - we add a new phase, we append a vector to *phases. - E.g. (*phases)[0] is the sorted list of cindexes - in the first phase of the computation... and so on. - Note, this function is called multiple times, and - each time we add one or more phases to this vector, - so its size grows. + @param [in,out] phase_indexes This vector, to some elements of which this + function writes each time it is called, maps from + cindex_id to the 'phase index'. A phase index is a + number identifying the phases [like coarse steps] of + the computation, with zero for the first phase, one + for the second, etc. We work out how many phase + indexes have been used already by previous epochs, + from phases->size(). Actually, phase_indexes is + really just a temporary variable used by this + function, that we allocate outside this function for + efficiency. It is initialized to -1 outside this + function; different invocations of this function work + with different non-overlapping elements of the vector. + @param [in,out] phases This is the output of this + function. Each time we add a new phase, we append a + vector to *phases. E.g. (*phases)[0] is the sorted + list of cindexes in the first phase of the + computation... and so on. Note, this function is + called multiple times, and each time we add one or + more phases to this vector, so its size grows. 
*/ static inline void ComputeComputationPhasesForEpoch( const Nnet &nnet, @@ -1321,17 +1410,17 @@ static inline void ComputeComputationPhasesForEpoch( void ComputeComputationPhases( const Nnet &nnet, const ComputationGraph &graph, - std::vector > *phases) { + std::vector > > *phases_per_segment) { using namespace computation_graph; int32 num_cindex_ids = graph.cindexes.size(); - std::vector cindex_id_to_epoch; - std::vector > epochs; + std::vector cindex_id_to_segment_and_epoch; + std::vector > > epochs_per_segment; std::vector epoch_is_trivial; - ComputeEpochInfo(nnet, graph, &cindex_id_to_epoch, - &epochs, &epoch_is_trivial); + ComputeEpochInfo(nnet, graph, &cindex_id_to_segment_and_epoch, + &epochs_per_segment, &epoch_is_trivial); - KALDI_ASSERT(SumVectorSizes(epochs) == num_cindex_ids); + KALDI_ASSERT(SumVectorSizes(epochs_per_segment) == num_cindex_ids); // dependencies_subset contains just the subset of dependencies // of each cindex_id, that have the same epoch index as @@ -1339,8 +1428,10 @@ void ComputeComputationPhases( // cindexes within a certain epoch (relevant for things like // LSTMs). std::vector > dependencies_subset; - ComputeDependenciesSubset(graph, cindex_id_to_epoch, + ComputeDependenciesSubset(graph, cindex_id_to_segment_and_epoch, &dependencies_subset); + // destroy cindex_id_to_segment_and_epoch, it's no longer needed. + { std::vector temp; temp.swap(cindex_id_to_segment_and_epoch); } // depend_on_subset is a subset of the normal "depend_on" list (i.e. a list of // all cindex_ids that depend on the current cindex_id), limited to just those @@ -1348,31 +1439,32 @@ void ComputeComputationPhases( std::vector > depend_on_subset; ComputeGraphTranspose(dependencies_subset, &depend_on_subset); - int32 num_epoch_indexes = epoch_is_trivial.size(); + int32 num_epoch_indexes = epoch_is_trivial.size(), + num_segments = graph.segment_ends.size(); // "phase_indexes" is used inside ComputeComputationPhasesForEpoch. std::vector phase_indexes(num_cindex_ids, -1); - if (phases) { - phases->clear(); - phases->reserve(50); // minimize unnecessary copies. 50 is very - // arbitrarily chosen. + phases_per_segment->clear(); + phases_per_segment->resize(num_segments); + + for (int32 segment = 0; segment < num_segments; segment++) { + phases_per_segment->reserve(50); // minimize unnecessary copies. 50 is + // very arbitrarily chosen. + for (int32 epoch = 0; epoch < num_epoch_indexes; epoch++) + ComputeComputationPhasesForEpoch(nnet, graph, + epochs_per_segment[segment][epoch], + dependencies_subset, + depend_on_subset, + epoch_is_trivial[epoch], + &phase_indexes, + &((*phases_per_segment)[segment])); } - for (int32 epoch = 0; - epoch < num_epoch_indexes; - epoch++) - ComputeComputationPhasesForEpoch(nnet, graph, - epochs[epoch], - dependencies_subset, - depend_on_subset, - epoch_is_trivial[epoch], - &phase_indexes, phases); - // make sure everything was computable. If the next assert fails it's likely // a bug in this function or in PruneComputataionGraph. 
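Stripping away the epoch and segment machinery, the phase assignment amounts to level-by-level topological layering: a cindex joins the current phase once all of its same-epoch dependencies sit in strictly earlier phases. The toy sketch below shows that layering; the real code uses queues rather than repeated scans, and works per epoch and per segment.

// Sketch of phase assignment as level-by-level topological layering.
#include <cassert>
#include <vector>

int main() {
  // dependencies[c] = same-epoch dependencies of c.
  std::vector<std::vector<int> > dependencies = {
    {}, {}, {0, 1}, {2}, {2}
  };
  int n = dependencies.size();
  std::vector<int> phase_index(n, -1);
  std::vector<std::vector<int> > phases;

  int num_assigned = 0;
  while (num_assigned < n) {
    std::vector<int> this_phase;
    for (int c = 0; c < n; c++) {
      if (phase_index[c] != -1) continue;
      bool ready = true;
      for (size_t i = 0; i < dependencies[c].size(); i++)
        if (phase_index[dependencies[c][i]] == -1) { ready = false; break; }
      if (ready) this_phase.push_back(c);
    }
    assert(!this_phase.empty() && "cycle in dependency graph");
    for (size_t i = 0; i < this_phase.size(); i++)
      phase_index[this_phase[i]] = static_cast<int>(phases.size());
    phases.push_back(this_phase);
    num_assigned += this_phase.size();
  }
  // Expected layering: {0,1}, {2}, {3,4}.
  assert(phases.size() == 3 && phases[0].size() == 2 && phases[2].size() == 2);
  return 0;
}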
- KALDI_ASSERT(SumVectorSizes(*phases) == num_cindex_ids); + KALDI_ASSERT(SumVectorSizes(*phases_per_segment) == num_cindex_ids); } CindexSet::CindexSet(const ComputationGraph &graph): @@ -1835,7 +1927,6 @@ void ComputeComputationSteps( ComputationGraph *graph, std::vector > *steps) { using namespace compute_computation_steps; - steps->clear(); AddInputSteps(nnet, request, *graph, steps); { std::vector > component_steps; @@ -1847,16 +1938,9 @@ void ComputeComputationSteps( ReorderIndexes(nnet, request, *graph, steps); AddDimRangeSteps(nnet, graph, steps); AddOutputSteps(nnet, request, *graph, steps); - - int32 num_cindexes = 0; - for (int32 i = 0; i < steps->size(); i++) - num_cindexes += (*steps)[i].size(); - // The next line has ">=" not "==" because it is possible (although unlikely - // in normal setups) that some cindexes of Descriptors which are at the inputs - // of Components, - KALDI_ASSERT(num_cindexes >= graph->cindexes.size()); } + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-computation-graph.h b/src/nnet3/nnet-computation-graph.h index 660e20d36ad..41087123421 100644 --- a/src/nnet3/nnet-computation-graph.h +++ b/src/nnet3/nnet-computation-graph.h @@ -62,6 +62,23 @@ struct ComputationGraph { /// those that are used (which will vary depending on availability). std::vector > dependencies; + /// This variable is only of particular interest in a 'multi-segment' + /// computation, which is used while creating computations for 'online' + /// operation (for the kind of situation where you provide some input; run the + /// computation; get some output, provide some more input for larger 't' + /// values, etc.). In this context, a 'segment' is a continuous range of + /// cindex_ids, and a segment_end is one past the end of each segment, which + /// is the same as the beginning of the next segment, if there is one. In the + /// case of a fully-created computation graph with only one segment, this will + /// contain just one value which equals the number of cindex_ids. + /// This information is needed to correctly order the computation, because + /// + /// the computation graph itself does not contain dependencies that encode the + /// ordering of segments (and even if it did contain those dependencies, it's + /// not really compatible with the way we use the scc's in the graph structure + /// of the network to order the computation). + std::vector segment_ends; + /// Maps a Cindex to an integer cindex_id. If not present, then add it (with /// the corresponding "is_input" flag set to the value "input") and set /// *is_new to true. If present, set is_new to false and return the existing @@ -72,9 +89,12 @@ struct ComputationGraph { /// -1 if the Cindex is not present, and the user should check for this. int32 GetCindexId(const Cindex &cindex) const; - /// This function renumbers the cindex-ids, keeping only for which keep[c] is - /// true. The "keep" array must be the same size as this->cindexes. - void Renumber(const std::vector &keep); + /// This function renumbers the cindex-ids (but only those with index c >= start_cindex_id, + // keeping only for which keep[c - start_cindex_id] is + /// true. The "keep" array must be the same size as this->cindexes.size() - + /// start_cindex_id. 
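Because segment_ends (described above) is sorted and stores one-past-the-end positions, the segment containing a given cindex_id can be found with a single std::upper_bound. SegmentOf in the sketch below is an illustrative helper, not a member of the struct:

// Sketch: finding which segment a cindex_id belongs to from segment_ends,
// which stores one-past-the-end cindex_ids of each segment, in increasing order.
#include <algorithm>
#include <cassert>
#include <vector>

int SegmentOf(const std::vector<int> &segment_ends, int cindex_id) {
  // The segment index equals the number of segment boundaries <= cindex_id.
  return std::upper_bound(segment_ends.begin(), segment_ends.end(), cindex_id)
         - segment_ends.begin();
}

int main() {
  // Two segments: cindex_ids [0, 100) and [100, 250).
  std::vector<int> segment_ends = { 100, 250 };
  assert(SegmentOf(segment_ends, 0) == 0);
  assert(SegmentOf(segment_ends, 99) == 0);
  assert(SegmentOf(segment_ends, 100) == 1);
  assert(SegmentOf(segment_ends, 249) == 1);
  return 0;
}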
+ void Renumber(int32 start_cindex_id, + const std::vector &keep); /// This function, useful for debugging/visualization purposes, @@ -97,13 +117,18 @@ struct ComputationGraph { class ComputationGraphBuilder { public: ComputationGraphBuilder(const Nnet &nnet, - const ComputationRequest &request, - ComputationGraph *graph): - nnet_(nnet), request_(request), graph_(graph), current_distance_(-1) { } - - // Does the initial computation (populating the graph and computing - // whether each required cindex_id is computable), without the pruning. - void Compute(); + ComputationGraph *graph); + + // Does the initial computation (populating the graph and computing whether + // each required cindex_id is computable), without the pruning. In the normal + // case you call this just once with one 'request', but in the 'online' case + // you call Compute() [then maybe check AllOutputsAreComputable()] then + // Prune() multiple times, with a sequence of different requests for + // increasing time values. + // Note: it sets the class member request_ to the address of 'request', so + // you should not let 'request' go out of scope while this class might + // still use it (e.g. until you call Compute() with a different + void Compute(const ComputationRequest &request); // Returns true if all requested outputs are computable. To be called after // Compute() but before Prune((). @@ -211,14 +236,18 @@ class ComputationGraphBuilder { // PruneDependencies() to remove unused dependencies, so it will only say // something is required if it is really accessed in the computation. // We'll later use this to remove unnecessary cindexes. - void ComputeRequiredArray(std::vector *required) const; + // 'start_cindex_id' is the cindex_id from which the 'required' array is + // to start (normally zero, but may be nonzero in multi-segment computations); + // so 'required' is indexed by cindex_id - start_cindex_id. + void ComputeRequiredArray(int32 start_cindex_id, + std::vector *required) const; // this function, to be called from Compute(), does some sanity checks to // verify that the internal state is consistent. void Check() const; const Nnet &nnet_; - const ComputationRequest &request_; + const ComputationRequest *request_; ComputationGraph *graph_; // this is the transpose of graph_->dependencies; it tells us @@ -248,7 +277,7 @@ class ComputationGraphBuilder { std::vector usable_count_; // current_distance_ >= 0 is the distance to the output, of the cindex_ids in - // current_queue_; + // current_queue_. int32 current_distance_; // the cindex_ids in current_queue_ are at distance "current_distance" to the // output and have not yet had their dependencies processed. @@ -322,23 +351,29 @@ class IndexSet { @param [in] nnet The neural network this computation is for @param [in] graph The computation graph that we're computing phases for. - @param [out] phases The phases. Suppose the computation can be completed - in 20 phases, then phases->size() will be 20 at exit, and - (*phases)[0] will be a sorted list of cindex_ids. that - belong to the first phase, and so on. (Remember, a - cindex_id is an index into graph->cindexes; it compactly - identifies a cindex.) The sets represented by the - elements of 'phases' will be disjoint and will cover all - elements in [0 .. computation.cindexes.size() - 1]. - - This function will be crash if the computation cannot - actualy be computed. Note: we assume you have called - PruneComputationGraph() before this function. 
+ @param [out] phases_per_segment The phases, listed separately for each + segment of the computation [there will be just one segment in + the normal case, more in the online-recognition case]. Consider + just one segment for now. Suppose the computation can be + completed in 20 phases, then (*phases)[0].size() will be 20 at + exit, and (*phases)[0][0] will be a sorted list of cindex_ids. + that belong to the first phase, and so on. (Remember, a + cindex_id is an index into graph->cindexes; it compactly + identifies a cindex.) The sets represented by the int32's in + 'phases_per_segment' will be disjoint and will cover all + elements in [0 .. computation.cindexes.size() - 1]. + + Note: we assume you have called PruneComputationGraph() before + this function. Even so, this function will be crash if the + computation cannot actually be computed-- there are some + mal-formed computations where you can build the computation graph + but not the ordering of cindexes because there are dependencies + forward and backward in time that intertwine. */ void ComputeComputationPhases( const Nnet &nnet, const ComputationGraph &computation_graph, - std::vector > *phases); + std::vector > > *phases_per_segment); /** @@ -351,9 +386,9 @@ void ComputeComputationPhases( - All cindex_ids within a given step correspond to the same node in the graph - All dependencies of cindex_ids within a given step have been computed in earlier steps. - . -There are also some extra, more obscure properties that the sequence of steps -must satisfy: + + There are also some extra, more obscure properties that the sequence of steps + must satisfy: - Any input or output in the ComputationRequest must be in one step, with the Indexes in the same order as specified in the ComputationRequest. (Note: inputs can be for nodes of type kComponent as well as kInput). @@ -366,8 +401,8 @@ must satisfy: Indexes appearing in the same order. (This lets us use a sub-matrix for the kDimRange node). -The reason why computation_graph is not provided as a const argument is -that in order to ensure the final property we may have to add a few new cindex_ids. + The reason why computation_graph is not provided as a const argument is that in + order to ensure the final property we may have to add a few new cindex_ids. */ void ComputeComputationSteps( const Nnet &nnet, diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 3c6d7e21bd9..f2759d7705d 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -689,8 +689,9 @@ void ConsolidateIoOperations(const Nnet &nnet, reordered_commands[segments[s].second].command_type = kNoOperationMarker; // for each segment we'll divide the commands up into those that must appear - // at the left (start) of the segment, those that must appear in the middle - // and those that must appear at the right (end). + // at the left of the segment (kAcceptInput for inputs and output-derivs), those + // that must appear in the middle (most commands), those that must appear + // on the right (kProvideOutput for output nodes and input derivatives). 
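The reordering performed per segment by ConsolidateIoOperations() reduces to a three-way stable partition by command type: kAcceptInput commands first, kProvideOutput commands last, and everything else in between in its original order. A standalone sketch of that partition, using command types directly rather than command indexes as the real code does:

// Sketch of the per-segment reordering: kAcceptInput first, kProvideOutput
// last, everything else in between, each group keeping its relative order.
#include <cassert>
#include <vector>

enum CommandType { kAcceptInput, kProvideOutput, kPropagate, kNoOperationMarker };

std::vector<CommandType> Consolidate(const std::vector<CommandType> &segment) {
  std::vector<CommandType> left, middle, right;
  for (size_t c = 0; c < segment.size(); c++) {
    if (segment[c] == kAcceptInput) left.push_back(segment[c]);
    else if (segment[c] == kProvideOutput) right.push_back(segment[c]);
    else middle.push_back(segment[c]);
  }
  std::vector<CommandType> reordered(left);
  reordered.insert(reordered.end(), middle.begin(), middle.end());
  reordered.insert(reordered.end(), right.begin(), right.end());
  return reordered;
}

int main() {
  std::vector<CommandType> segment = {
    kPropagate, kAcceptInput, kProvideOutput, kPropagate, kAcceptInput };
  std::vector<CommandType> out = Consolidate(segment);
  assert(out.front() == kAcceptInput && out[1] == kAcceptInput);
  assert(out.back() == kProvideOutput);
  assert(out[2] == kPropagate && out[3] == kPropagate);
  return 0;
}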
std::vector left_commands, middle_commands, right_commands; for (size_t s = 0; s < segments.size(); s++) { @@ -700,11 +701,9 @@ void ConsolidateIoOperations(const Nnet &nnet, middle_commands.clear(); right_commands.clear(); for (int32 c = segment_start; c < segment_end; c++) { - if (computation->commands[c].command_type == kProvideOutput && - nnet.IsInputNode(computation->commands[c].arg2)) { + if (computation->commands[c].command_type == kProvideOutput) { right_commands.push_back(c); - } else if (computation->commands[c].command_type == kProvideOutput || - computation->commands[c].command_type == kAcceptInput) { + } else if (computation->commands[c].command_type == kAcceptInput) { left_commands.push_back(c); } else { middle_commands.push_back(c); diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 520fe3d34a9..f38e4d854ff 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -298,12 +298,9 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, NnetComputation *computation); -/// This optimization puts the I/O operations (kAcceptInput and kProvideOutput -/// at the very beginning or end of segments of computation. Specifically: -/// first the computation is broken up into segments delimited by kNoOperationMarker. -/// Then, for each segment, all I/O operations are moved to the start of the segment, -/// *except for* kProvideOutput for inpu nodes (where the network provides an -/// input-deriv), which is moved to the end of the segment. +/// This optimization puts the input operations (kAcceptInput) and output +/// operations (kProvideOutput) at the very beginning or end of segments of +/// computation, respectively. void ConsolidateIoOperations(const Nnet &nnet, NnetComputation *computation); diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 3d4330ac9f3..999789650b5 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -68,8 +68,8 @@ void EvaluateComputationRequest( const ComputationRequest &request, std::vector > *is_computable) { ComputationGraph graph; - ComputationGraphBuilder builder(nnet, request, &graph); - builder.Compute(); + ComputationGraphBuilder builder(nnet, &graph); + builder.Compute(request); builder.GetComputableInfo(is_computable); if (GetVerboseLevel() >= 4) { std::ostringstream graph_pretty; From c5f441be43c6af8b7d924331e8f4cf059f7726e2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 10 Oct 2016 21:02:54 -0400 Subject: [PATCH 009/213] Fix a few bugs shown up by valgrind testing --- src/chainbin/nnet3-chain-acc-lda-stats.cc | 4 +-- src/nnet3/nnet-compile-utils-test.cc | 34 ++++++++++--------- src/nnet3/nnet-component-test.cc | 14 ++++---- src/nnet3/nnet-computation.cc | 5 +++ src/nnet3bin/nnet3-acc-lda-stats.cc | 2 +- src/nnet3bin/nnet3-compute-from-egs.cc | 16 ++++----- .../nnet3-discriminative-compute-from-egs.cc | 17 ++++------ 7 files changed, 47 insertions(+), 45 deletions(-) diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc index 3f092879b6e..b195f5ba1fb 100644 --- a/src/chainbin/nnet3-chain-acc-lda-stats.cc +++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc @@ -54,7 +54,7 @@ class NnetChainLdaStatsAccumulator { NnetComputer computer(options, computation, nnet_, NULL); computer.AcceptInputs(nnet_, eg.inputs); - computer.Forward(); + computer.Run(); const CuMatrixBase &nnet_output = computer.GetOutput("output"); AccStatsFromOutput(eg, nnet_output); } @@ -202,5 +202,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git 
a/src/nnet3/nnet-compile-utils-test.cc b/src/nnet3/nnet-compile-utils-test.cc index e5c9e24cc46..53820abf32a 100644 --- a/src/nnet3/nnet-compile-utils-test.cc +++ b/src/nnet3/nnet-compile-utils-test.cc @@ -71,10 +71,10 @@ void UnitTestSplitLocationsBackward(bool verbose) { int32 minibatch_size = Rand() % 1024 + 100; int32 num_submat_indexes = Rand() % 10 + 1; int32 max_submat_list_size = Rand() % 10 + 1; - int32 min_num_kAddRows = Rand() % 2; // minimum number of kAddRows compatible + int32 min_num_kaddrows = Rand() % 2; // minimum number of kAddRows compatible // lists expected in the final split lists. This value will be used to // create input submat_lists so that this is guaranteed - max_submat_list_size = min_num_kAddRows + max_submat_list_size; + max_submat_list_size = min_num_kaddrows + max_submat_list_size; std::vector > all_pairs; all_pairs.reserve(minibatch_size * max_submat_list_size); @@ -95,8 +95,8 @@ void UnitTestSplitLocationsBackward(bool verbose) { num_locations : max_generated_submat_list_size; submat_lists[i].reserve(num_locations); for (int32 j = 0; j < num_locations; j++) { - if (j <= min_num_kAddRows) - // since we need min_num_kAddRows in the split_lists we ensure that + if (j <= min_num_kaddrows) + // since we need min_num_kaddrows in the split_lists we ensure that // we add a pair with the same first element in all the submat_lists submat_lists[i].push_back(std::make_pair(submat_indexes[j], Rand() % minibatch_size)); @@ -148,7 +148,7 @@ void UnitTestSplitLocationsBackward(bool verbose) { PrintVectorVectorPair(split_lists); KALDI_LOG << "==========================="; } - int32 num_kAddRows_in_output = 0; + int32 num_kaddrows_in_output = 0; int32 first_value; std::vector second_values; // ensure that elements in submat_lists are also present @@ -163,7 +163,7 @@ void UnitTestSplitLocationsBackward(bool verbose) { KALDI_ASSERT((split_lists[i][j].first == first_value) && (split_lists[i][j].second == second_values[j])); } - num_kAddRows_in_output++; + num_kaddrows_in_output++; } for (int32 j = 0; j < split_lists[i].size(); j++) { if (split_lists[i][j].first == -1) @@ -178,7 +178,7 @@ void UnitTestSplitLocationsBackward(bool verbose) { KALDI_ASSERT(all_pairs.size() == 0); // ensure that there are at least as many kAddRows compatible split_lists as // specified - KALDI_ASSERT(num_kAddRows_in_output >= min_num_kAddRows); + KALDI_ASSERT(num_kaddrows_in_output >= min_num_kaddrows); } @@ -276,10 +276,10 @@ void UnitTestSplitLocations(bool verbose) { int32 minibatch_size = Rand() % 1024 + 100; int32 num_submat_indexes = Rand() % 10 + 1; int32 max_submat_list_size = Rand() % 10 + 1; - int32 min_num_kAddRows = Rand() % 2; // minimum number of kAddRows compatible + int32 min_num_kaddrows = Rand() % 2; // minimum number of kAddRows compatible // lists expected in the final split lists. 
This value will be used to // create input submat_lists so that this is guaranteed - max_submat_list_size = min_num_kAddRows + max_submat_list_size; + max_submat_list_size = min_num_kaddrows + max_submat_list_size; std::vector > all_pairs; all_pairs.reserve(minibatch_size * max_submat_list_size); @@ -300,12 +300,14 @@ void UnitTestSplitLocations(bool verbose) { num_locations : max_generated_submat_list_size; submat_lists[i].reserve(num_locations); for (int32 j = 0; j < num_locations; j++) { - if (j <= min_num_kAddRows) - // since we need min_num_kAddRows in the split_lists we ensure that + // note from dan: I edited the following line to resolve a valgrind error + // but cannot really understand at this point what this code is doing. + if (j <= min_num_kaddrows && j < num_submat_indexes) { + // since we need min_num_kaddrows in the split_lists we ensure that // we add a pair with the same first element in all the submat_lists submat_lists[i].push_back(std::make_pair(submat_indexes[j], - Rand() % minibatch_size)); - + Rand() % minibatch_size)); + } submat_lists[i].push_back( std::make_pair(submat_indexes[Rand() % num_submat_indexes], Rand() % minibatch_size)); @@ -323,7 +325,7 @@ void UnitTestSplitLocations(bool verbose) { KALDI_LOG << "==========================="; KALDI_LOG << split_lists.size(); } - int32 num_kAddRows_in_output = 0; + int32 num_kaddrows_in_output = 0; int32 first_value; std::vector second_values; // ensure that elements in submat_lists are also present @@ -337,7 +339,7 @@ void UnitTestSplitLocations(bool verbose) { KALDI_ASSERT((split_lists[i][j].first == first_value) && (split_lists[i][j].second == second_values[j])); } - num_kAddRows_in_output++; + num_kaddrows_in_output++; } for (int32 j = 0; j < split_lists[i].size(); j++) { if (split_lists[i][j].first == -1) @@ -352,7 +354,7 @@ void UnitTestSplitLocations(bool verbose) { KALDI_ASSERT(all_pairs.size() == 0); // ensure that there are at least as many kAddRows compatible split_lists as // specified - KALDI_ASSERT(num_kAddRows_in_output >= min_num_kAddRows); + KALDI_ASSERT(num_kaddrows_in_output >= min_num_kaddrows); } } // namespace nnet2 diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index 3cc6af1c70d..1cb96563b77 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -25,9 +25,9 @@ namespace kaldi { namespace nnet3 { // Reset seeds for test time for RandomComponent static void ResetSeed(int32 rand_seed, const Component &c) { - RandomComponent *rand_component = + RandomComponent *rand_component = const_cast(dynamic_cast(&c)); - + if (rand_component != NULL) { srand(rand_seed); rand_component->ResetGenerator(); @@ -48,8 +48,10 @@ static bool StringsApproxEqual(const std::string &a, // if it's not the last digit in the string, goto fail if (pos + 1 != size && isdigit(a[pos+1])) goto fail; + if (pos == 0) + goto fail; size_t pos2; - for (pos2 = pos - 1; pos2 > 0; pos2--) { + for (pos2 = static_cast(pos) - 1; pos2 > 0; pos2--) { if (a[pos2] == '.') break; // we accept this difference: we went backwards and found a '.' if (!isdigit(a[pos2])) // we reject this difference: we went back and // found non-digit before '.' -> not floating @@ -198,7 +200,7 @@ void TestSimpleComponentPropagateProperties(const Component &c) { int32 properties = c.Properties(); Component *c_copy = NULL, *c_copy_scaled = NULL; int32 rand_seed = Rand(); - + if (RandInt(0, 1) == 0) c_copy = c.Copy(); // This will test backprop with an updatable component. 
if (RandInt(0, 1) == 0 && @@ -234,7 +236,7 @@ void TestSimpleComponentPropagateProperties(const Component &c) { if ((properties & kPropagateAdds) && (properties & kPropagateInPlace)) { KALDI_ERR << "kPropagateAdds and kPropagateInPlace flags are incompatible."; } - + ResetSeed(rand_seed, c); c.Propagate(NULL, input_data, &output_data1); @@ -327,7 +329,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, output_deriv(num_rows, output_dim, kSetZero, output_stride_type); input_data.SetRandn(); output_deriv.SetRandn(); - + ResetSeed(rand_seed, c); c.Propagate(NULL, input_data, &output_data); diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 30dbaa94256..c58fb87dde4 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -711,6 +711,11 @@ void NnetComputation::Read(std::istream &is, bool binary) { } + // delete any existing pointers in component_precomputed_indexes. + for (size_t i = 0; i < component_precomputed_indexes.size(); i++) + delete component_precomputed_indexes[i]; + component_precomputed_indexes.clear(); + size_t num_component_precomputed_indexes; ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_component_precomputed_indexes); diff --git a/src/nnet3bin/nnet3-acc-lda-stats.cc b/src/nnet3bin/nnet3-acc-lda-stats.cc index 0b3b537855e..c8911a4a39f 100644 --- a/src/nnet3bin/nnet3-acc-lda-stats.cc +++ b/src/nnet3bin/nnet3-acc-lda-stats.cc @@ -46,7 +46,7 @@ class NnetLdaStatsAccumulator { NnetComputer computer(options, computation, nnet_, NULL); computer.AcceptInputs(nnet_, eg.io); - computer.Forward(); + computer.Run(); const CuMatrixBase &nnet_output = computer.GetOutput("output"); AccStatsFromOutput(eg, nnet_output); } diff --git a/src/nnet3bin/nnet3-compute-from-egs.cc b/src/nnet3bin/nnet3-compute-from-egs.cc index 66eace0dab5..648b5e1408f 100644 --- a/src/nnet3bin/nnet3-compute-from-egs.cc +++ b/src/nnet3bin/nnet3-compute-from-egs.cc @@ -46,7 +46,7 @@ class NnetComputerFromEg { options.debug = true; NnetComputer computer(options, computation, nnet_, NULL); computer.AcceptInputs(nnet_, eg.io); - computer.Forward(); + computer.Run(); const CuMatrixBase &nnet_output = computer.GetOutput("output"); output->Resize(nnet_output.NumRows(), nnet_output.NumCols()); nnet_output.CopyToMat(output); @@ -54,7 +54,7 @@ class NnetComputerFromEg { private: const Nnet &nnet_; CachingOptimizingCompiler compiler_; - + }; } @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { "e.g.:\n" "nnet3-compute-from-egs --apply-exp=true 0.raw ark:1.egs ark:- | matrix-sum-rows ark:- ... 
\n" "See also: nnet3-compute\n"; - + bool binary_write = true, apply_exp = false; std::string use_gpu = "yes"; @@ -89,7 +89,7 @@ int main(int argc, char *argv[]) { "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Read(argc, argv); - + if (po.NumArgs() != 3) { po.PrintUsage(); exit(1); @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) { #if HAVE_CUDA==1 CuDevice::Instantiate().SelectGpuId(use_gpu); #endif - + std::string nnet_rxfilename = po.GetArg(1), examples_rspecifier = po.GetArg(2), matrix_wspecifier = po.GetArg(3); @@ -109,10 +109,10 @@ int main(int argc, char *argv[]) { NnetComputerFromEg computer(nnet); int64 num_egs = 0; - + SequentialNnetExampleReader example_reader(examples_rspecifier); BaseFloatMatrixWriter matrix_writer(matrix_wspecifier); - + for (; !example_reader.Done(); example_reader.Next(), num_egs++) { Matrix output; computer.Compute(example_reader.Value(), &output); @@ -131,5 +131,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/nnet3bin/nnet3-discriminative-compute-from-egs.cc b/src/nnet3bin/nnet3-discriminative-compute-from-egs.cc index 7736290d1d5..d8b0f469beb 100644 --- a/src/nnet3bin/nnet3-discriminative-compute-from-egs.cc +++ b/src/nnet3bin/nnet3-discriminative-compute-from-egs.cc @@ -46,7 +46,7 @@ class NnetComputerFromEg { options.debug = true; NnetComputer computer(options, computation, nnet_, NULL); computer.AcceptInputs(nnet_, eg.io); - computer.Forward(); + computer.Run(); const CuMatrixBase &nnet_output = computer.GetOutput("output"); output->Resize(nnet_output.NumRows(), nnet_output.NumCols()); nnet_output.CopyToMat(output); @@ -54,7 +54,7 @@ class NnetComputerFromEg { private: const Nnet &nnet_; CachingOptimizingCompiler compiler_; - + }; } @@ -80,7 +80,7 @@ int main(int argc, char *argv[]) { "e.g.:\n" "nnet3-discriminative-compute-from-egs --apply-exp=true 0.raw ark:1.degs ark:- | matrix-sum-rows ark:- ... \n" "See also: nnet3-compute nnet3-compute-from-egs\n"; - + bool binary_write = true, apply_exp = false; std::string use_gpu = "yes"; @@ -93,7 +93,7 @@ int main(int argc, char *argv[]) { "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Read(argc, argv); - + if (po.NumArgs() != 3) { po.PrintUsage(); exit(1); @@ -102,7 +102,7 @@ int main(int argc, char *argv[]) { #if HAVE_CUDA==1 CuDevice::Instantiate().SelectGpuId(use_gpu); #endif - + std::string nnet_rxfilename = po.GetArg(1), examples_rspecifier = po.GetArg(2), matrix_wspecifier = po.GetArg(3); @@ -113,10 +113,10 @@ int main(int argc, char *argv[]) { NnetComputerFromEg computer(nnet); int64 num_egs = 0; - + SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier); BaseFloatMatrixWriter matrix_writer(matrix_wspecifier); - + for (; !example_reader.Done(); example_reader.Next(), num_egs++) { Matrix output; NnetExample eg; @@ -146,6 +146,3 @@ int main(int argc, char *argv[]) { return -1; } } - - - From cab8420abb6cd50e3b4bf64884425f9518103d27 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 15 Oct 2016 15:29:21 -0400 Subject: [PATCH 010/213] Refactoring generation of the computation 'steps' for more clarity and to allow online computation. Add a basic test for multi-segment computation. 
--- src/nnet3/nnet-compile-test.cc | 62 ++- src/nnet3/nnet-compile.cc | 59 +-- src/nnet3/nnet-compile.h | 3 - src/nnet3/nnet-computation-graph.cc | 778 +++++++++++++++------------- src/nnet3/nnet-computation-graph.h | 159 +++++- src/nnet3/nnet-test-utils.cc | 2 + src/nnet3/nnet-test-utils.h | 2 + 7 files changed, 630 insertions(+), 435 deletions(-) diff --git a/src/nnet3/nnet-compile-test.cc b/src/nnet3/nnet-compile-test.cc index d405fd0f5fa..c0e1b6f8b5b 100644 --- a/src/nnet3/nnet-compile-test.cc +++ b/src/nnet3/nnet-compile-test.cc @@ -28,7 +28,6 @@ namespace nnet3 { void UnitTestNnetCompile() { for (int32 n = 0; n < 20; n++) { struct NnetGenerationOptions gen_config; - std::vector configs; GenerateConfigSequence(gen_config, &configs); Nnet nnet; @@ -56,6 +55,66 @@ void UnitTestNnetCompile() { } } + +// this tests compilation where there are more than one +// computation-request... this is to test some of the +// low-level utilities that will be used in online computation. +void UnitTestNnetCompileMulti() { + for (int32 n = 0; n < 20; n++) { + struct NnetGenerationOptions gen_config; + gen_config.allow_use_of_x_dim = false; + + std::vector configs; + GenerateConfigSequence(gen_config, &configs); + Nnet nnet; + for (size_t j = 0; j < configs.size(); j++) { + KALDI_LOG << "Input config[" << j << "] is: " << configs[j]; + std::istringstream is(configs[j]); + nnet.ReadConfig(is); + } + + ComputationRequest request1, request2; + std::vector > inputs1, inputs2; + ComputeExampleComputationRequestSimple(nnet, &request1, &inputs1); + ComputeExampleComputationRequestSimple(nnet, &request2, &inputs2); + + + KALDI_LOG << "Computation request 1 is:"; + request1.Print(std::cerr); + KALDI_LOG << "Computation request 2 is:"; + request2.Print(std::cerr); + + std::vector requests; + request2.store_component_stats = request1.store_component_stats; + request1.need_model_derivative = false; + request2.need_model_derivative = false; + requests.push_back(&request1); + requests.push_back(&request2); + + // set all the x indexes to 1 for request 2 (they would otherwise + // be zero). This ensures that there is no overlap + // between the inputs and outputs on the two requests. + for (int32 i = 0; i < request2.inputs.size(); i++) + for (int32 j = 0; j < request2.inputs[i].indexes.size(); j++) + request2.inputs[i].indexes[j].x = 1; + for (int32 i = 0; i < request2.outputs.size(); i++) + for (int32 j = 0; j < request2.outputs[i].indexes.size(); j++) + request2.outputs[i].indexes[j].x = 1; + + + NnetComputation computation; + Compiler compiler(requests, nnet); + + CompilerOptions opts; + compiler.CreateComputation(opts, &computation); + + std::ostringstream os; + computation.Print(os, nnet); + KALDI_LOG << "Generated computation is: " << os.str(); + } +} + + } // namespace nnet3 } // namespace kaldi @@ -65,6 +124,7 @@ int main() { // SetVerboseLevel(2); UnitTestNnetCompile(); + UnitTestNnetCompileMulti(); KALDI_LOG << "Nnet tests succeeded."; diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index 8e70ecb4c4c..ab4ea9917e3 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -69,21 +69,23 @@ void Compiler::CreateComputation(const CompilerOptions &opts, // (non-online computation), a vector of all zeros. 
std::vector step_to_segment; - for (size_t segment = 0; segment < requests_.size(); segment++) { - std::vector > this_segment_steps; - ComputeComputationSteps(nnet_, *(requests_[segment]), - phases_per_segment[segment], &graph_, - &this_segment_steps); - for (size_t i = 0; i < this_segment_steps.size(); i++) { - steps.push_back(std::vector()); - steps.back().swap(this_segment_steps[i]); - step_to_segment.push_back(segment); + + { + ComputationStepsComputer steps_computer(nnet_, &graph_, &steps, + &cindex_id_to_location_); + + for (size_t segment = 0; segment < requests_.size(); segment++) { + steps_computer.ComputeForSegment(*(requests_[segment]), + phases_per_segment[segment]); + while (step_to_segment.size() < steps.size()) + step_to_segment.push_back(segment); + + // save memory, by deleting the phases we just consumed. + std::vector > temp; + phases_per_segment[segment].swap(temp); } + steps_computer.Check(); } - // TODO (?) check that the total num_cindexes in the steps in >= - // graph->cindexes.size(). could do it inside CreateLocationInfo(). - phases_per_segment.clear(); - CreateLocationInfo(steps); std::vector deriv_needed; ComputeDerivNeeded(steps, step_to_segment, &deriv_needed); CreateStepInfo(deriv_needed, step_to_segment, &steps, computation); @@ -354,37 +356,6 @@ void Compiler::CreateStepInfo( } } -void Compiler::CreateLocationInfo( - const std::vector > &by_step) { - cindex_id_to_location_.clear(); - int32 num_cindex_ids = graph_.cindexes.size(), - total_cindex_ids = 0; - cindex_id_to_location_.resize(num_cindex_ids, std::pair(-1,-1)); - int32 num_steps = by_step.size(); - for (int32 step = 0; step < num_steps; step++) { - // output_cindex_ids is the cindex_ids that this step produces. - const std::vector &output_cindex_ids = by_step[step]; - total_cindex_ids += output_cindex_ids.size(); - int32 num_rows = output_cindex_ids.size(); - for (int32 row = 0; row < num_rows; row++) { - int32 cindex_id = output_cindex_ids[row]; - if (cindex_id_to_location_[cindex_id].first != -1) { - int32 node_id = graph_.cindexes[cindex_id].first; - if (nnet_.GetNode(node_id).node_type != kDescriptor || - nnet_.GetNode(node_id + 1).node_type != kComponent) - KALDI_ERR << "Cindexes may appear in >1 step only if they are " - "Descriptors for Component inputs: code error."; - } - cindex_id_to_location_[cindex_id] = std::pair(step, row); - } - } - // All cindex_ids in the graph must be present in a step, which is why - // we make the following assert. In general this will be with equality, - // but I believe there might be some weird edge cases, maybe involving - // kDimRange nodes, that would make this not true. [not 100% sure.] - KALDI_ASSERT(total_cindex_ids >= num_cindex_ids); -} - void Compiler::DoForwardComputation(int32 step, NnetComputation *computation) const { KALDI_ASSERT(step < static_cast(steps_.size())); diff --git a/src/nnet3/nnet-compile.h b/src/nnet3/nnet-compile.h index 8b9e738d251..36fcf84fbf1 100644 --- a/src/nnet3/nnet-compile.h +++ b/src/nnet3/nnet-compile.h @@ -110,9 +110,6 @@ class Compiler { precomputed_indexes_index(0) { } }; - // this sets up cindex_id_to_location_. - void CreateLocationInfo(const std::vector > &by_step); - // Computes the set of step-indexes of preceding steps that this step depends // on. Assumes CreateLocationInfo() has already been called. 
Requires // 'step_index' only to handle a special case, that if 'this_step' is a diff --git a/src/nnet3/nnet-computation-graph.cc b/src/nnet3/nnet-computation-graph.cc index cf43ca9f804..422a14bfe4c 100644 --- a/src/nnet3/nnet-computation-graph.cc +++ b/src/nnet3/nnet-computation-graph.cc @@ -465,6 +465,7 @@ void ComputationGraphBuilder::Compute(const ComputationRequest &request) { KALDI_ERR << "You are calling things in the wrong order: should be " << "Compute(), Prune(), Compute, Prune(), ..."; } + int32 cur_segment_start = graph_->cindexes.size(); request_ = &request; AddInputs(); AddOutputs(); // sets current_distance_ to 0. @@ -474,7 +475,7 @@ void ComputationGraphBuilder::Compute(const ComputationRequest &request) { BuildGraphOneIter(); // only check rarely if we're running at low verbose level. if (GetVerboseLevel() >= 3 || RandInt(1, (current_distance_ + 1)) == 1) - Check(); + Check(cur_segment_start); // TODO: come up with a scheme to delay when we call // UpdateAllComputableInfo(). UpdateAllComputableInfo(); @@ -486,13 +487,13 @@ void ComputationGraphBuilder::Compute(const ComputationRequest &request) { << "network topology?)"; if (RandInt(1, 2 * (graph_->segment_ends.size() + 1)) == 1) - Check(); + Check(cur_segment_start); } -void ComputationGraphBuilder::Check() const { +void ComputationGraphBuilder::Check(int32 start_cindex_id) const { int32 num_cindex_ids = graph_->cindexes.size(); - for (int32 cindex_id = 0; cindex_id < num_cindex_ids; + for (int32 cindex_id = start_cindex_id; cindex_id < num_cindex_ids; cindex_id += 1 + RandInt(0, num_cindex_ids / 100)) { { // check depend_on_this. std::vector depend_on_this = depend_on_this_[cindex_id]; @@ -513,12 +514,16 @@ void ComputationGraphBuilder::Check() const { KALDI_ASSERT(IsSortedAndUniq(dependencies)); for (size_t j = 0; j < size; j++) { int32 dep_cindex_id = dependencies[j]; - // make sure appears in appropriate depend_on_this_ array. - const std::vector &dep = depend_on_this_[dep_cindex_id]; - KALDI_ASSERT(std::count(dep.begin(), dep.end(), cindex_id) == 1); + if (dep_cindex_id >= start_cindex_id) { + // make sure appears in appropriate depend_on_this_ array. + const std::vector &dep = depend_on_this_[dep_cindex_id]; + KALDI_ASSERT(std::count(dep.begin(), dep.end(), cindex_id) == 1); + } } } - { // check usable_count_. + + { + // check usable_count_ int32 node_index = graph_->cindexes[cindex_id].first; int32 usable_count = usable_count_[cindex_id], usable_count_recomputed = nnet_.IsOutputNode(node_index) ? 1 : 0; @@ -1521,423 +1526,452 @@ bool IndexSet::operator () (const Index &index) const { } +ComputationStepsComputer::ComputationStepsComputer( + const Nnet &nnet, + ComputationGraph *graph, + std::vector > *steps, + std::vector > *locations): + nnet_(nnet), graph_(graph), steps_(steps), locations_(locations) { + steps_->clear(); + locations_->clear(); + int32 num_cindexes = graph_->cindexes.size(); + // leave a little space in case a few cindexes are added (unlikely + // but could happen with dim-range nodes). + locations_->reserve(num_cindexes + num_cindexes / 10); + locations_->resize(num_cindexes, std::pair(-1, -1)); +} -namespace compute_computation_steps { -// namespace for some helper functions for ComputeComputationSteps. - -/// Adds a "step" for each of the inputs in the ComputationRequest. -/// Does this in the same order in which they were declared in -/// the request (this order won't matter at all). -/// returns the total number of cindex_ids that correspond to inputs. 
-int32 AddInputSteps(const Nnet &nnet, - const ComputationRequest &request, - const ComputationGraph &graph, - std::vector > *steps) { - KALDI_ASSERT(steps->empty()); - steps->reserve(50); // will minimize unnecessary copies of vectors. - unordered_set all_nodes; // to make sure nothing is listed twice. - int32 num_cindex_ids = 0; - for (int32 i = 0; i < request.inputs.size(); i++) { - int32 n = nnet.GetNodeIndex(request.inputs[i].name); - if (n == -1) - KALDI_ERR << "Network has no output with name " - << request.inputs[i].name; - // ensure no input node is listed twice. - KALDI_ASSERT(all_nodes.count(n) == 0 && "Invalid computation request: " - "double listing of node."); - all_nodes.insert(n); - KALDI_ASSERT(!request.inputs[i].indexes.empty() && - "Computation request had no indexes for input "); - steps->push_back(std::vector()); - std::vector &this_step = steps->back(); - this_step.resize(request.inputs[i].indexes.size()); - for (int32 j = 0; j < request.inputs[i].indexes.size(); j++) { - Cindex cindex(n, request.inputs[i].indexes[j]); - int32 cindex_id = graph.GetCindexId(cindex); - KALDI_ASSERT(cindex_id != -1); // would be code error. - this_step[j] = cindex_id; +void ComputationStepsComputer::ComputeForSegment( + const ComputationRequest &request, + const std::vector > &phases) { + int32 this_num_phases = phases.size(); + for (int32 i = 0; i < this_num_phases; i++) { + std::vector > sub_phases; + SplitIntoSubPhases(phases[i], &sub_phases); + for (size_t j = 0; j < sub_phases.size(); j++) { + ProcessSubPhase(request, sub_phases[j]); } - num_cindex_ids += request.inputs[i].indexes.size(); } - return num_cindex_ids; } +void ComputationStepsComputer::ProcessInputOrOutputStep( + const ComputationRequest &request, + bool is_output, + const std::vector &sub_phase) { + int32 io_node = sub_phase[0].first; + if (is_output){ + KALDI_ASSERT(nnet_.IsOutputNode(io_node)); + } else { + KALDI_ASSERT(nnet_.IsInputNode(io_node)); + } + std::string node_name = nnet_.GetNodeName(io_node); + const std::vector &inputs_or_outputs = + (is_output ? request.outputs : request.inputs); + int32 io_index = -1; + for (size_t i = 0; i < inputs_or_outputs.size(); i++) + if (inputs_or_outputs[i].name == node_name) + io_index = i; + KALDI_ASSERT(io_index >= 0); + const std::vector &io_indexes = inputs_or_outputs[io_index].indexes; + std::vector io_cindexes(io_indexes.size()); + for (size_t i = 0, size = io_cindexes.size(); i < size; i++) { + io_cindexes[i].first = io_node; + io_cindexes[i].second = io_indexes[i]; + } + KALDI_ASSERT(io_cindexes.size() == sub_phase.size()); + // we expect the list of cindexes in 'io_cindexes' to be identical to + // that in 'sub_phase' (but they don't have to be in the same order)... for now we check the size, we'll spot-check + // that they are the same later. + // The actual output in 'steps' must be in the same order as + int32 step_index = AddStep(io_cindexes); + // Now spot-check that the cindexes in 'sub_phase' are the same as those + // we just added. [note: they don't have to be in the same order, but + // they should be the same set.] + for (size_t i = 0; i < sub_phase.size(); i += 10) { + const Cindex &cindex = sub_phase[i]; + int32 cindex_id = graph_->GetCindexId(cindex); + KALDI_ASSERT(cindex_id >= 0 && (*locations_)[cindex_id].first == step_index); + } +} -/// Adds a "step" for each of the outputs in the ComputationRequest. This will -/// be done after adding steps for all the inputs and then all the -/// non(input/output)s. 
Does this in the same order in which they were declared -/// in the request (this won't matter at all). -void AddOutputSteps(const Nnet &nnet, - const ComputationRequest &request, - const ComputationGraph &graph, - std::vector > *steps) { - std::set all_nodes; // to make sure nothing listed twice. - for (int32 i = 0; i < request.outputs.size(); i++) { - int32 n = nnet.GetNodeIndex(request.outputs[i].name); - if (n == -1) - KALDI_ERR << "Network has no output with name " - << request.outputs[i].name; - // ensure no output node is listed twice. - KALDI_ASSERT(all_nodes.count(n) == 0 && "Invalid computation request: " - "double listing of node."); - all_nodes.insert(n); - KALDI_ASSERT(!request.outputs[i].indexes.empty() && - "Computation request had no indexes for output "); - steps->push_back(std::vector()); - std::vector &this_step = steps->back(); - this_step.resize(request.outputs[i].indexes.size()); - for (int32 j = 0; j < request.outputs[i].indexes.size(); j++) { - Cindex cindex(n, request.outputs[i].indexes[j]); - int32 cindex_id = graph.GetCindexId(cindex); - KALDI_ASSERT(cindex_id != -1); // would be code error. - this_step[j] = cindex_id; +int32 ComputationStepsComputer::AddStep(const std::vector &cindexes, + bool add_if_absent) { + int32 step_index = steps_->size(); + steps_->push_back(std::vector()); + std::vector &step = steps_->back(); // vector of cindex_id. + step.resize(cindexes.size()); + size_t row_index = 0; + std::vector::const_iterator iter = cindexes.begin(), + end = cindexes.end(); + std::vector::iterator out_iter = step.begin(); + std::pair *locations = &((*locations_)[0]); + if (!add_if_absent) { + // this version of GetCindexId will not add CindexIds. + for (; iter != end; ++iter, ++out_iter, ++row_index) { + int32 cindex_id = graph_->GetCindexId(*iter); + *out_iter = cindex_id; + locations[cindex_id].first = step_index; + locations[cindex_id].second = row_index; + } + } else { + for (; iter != end; ++iter, ++out_iter, ++row_index) { + bool is_input = false; // only relevant if we have to add the cindex to + // the computation graph, which we won't for + // inputs (we only might for dim-range nodes). + bool added; + int32 cindex_id = graph_->GetCindexId(*iter, is_input, &added); + *out_iter = cindex_id; + if (added) { + KALDI_ASSERT(cindex_id == static_cast(locations_->size())); + locations_->resize(cindex_id + 1); + locations_->back().first = step_index; + locations_->back().second = row_index; + locations = &((*locations_)[0]); // in case it was reallocated + } else { + locations[cindex_id].first = step_index; + locations[cindex_id].second = row_index; + } } } + return step_index; } -/// Convert the cindex_ids in the vector "cindex_ids" to cindexes, but only -/// keeping those that correspond to nodes of type kComponent. -/// Asserts that none of these cindexes have the "is_input" set to true. -/// [this is possible because we call this only for phases >1, and inputs -/// should not be there.] 
-static void ExtractOnlyComponentCindexes(const std::vector &cindex_ids, - const ComputationGraph &graph, - const Nnet &nnet, - std::vector *cindexes) { - cindexes->clear(); - cindexes->reserve(cindex_ids.size()); + +int32 ComputationStepsComputer::AddStep(std::vector *cindex_ids) { + int32 step_index = steps_->size(); + KALDI_ASSERT(!cindex_ids->empty()); + steps_->push_back(std::vector()); + steps_->back().swap(*cindex_ids); + std::vector::const_iterator iter = steps_->back().begin(), + end = steps_->back().end(); + int32 row_index = 0; + std::pair *locations = &((*locations_)[0]); + size_t num_cindexes = graph_->cindexes.size(); + for (; iter != end; ++iter, ++row_index) { + int32 cindex_id = *iter; + KALDI_ASSERT(static_cast(cindex_id) < num_cindexes); + locations[cindex_id].first = step_index; + locations[cindex_id].second = row_index; + } + return step_index; +} + + +void ComputationStepsComputer::ConvertToCindexes( + const std::vector &cindex_ids, + std::vector *cindexes) const { + cindexes->resize(cindex_ids.size()); + size_t num_cindexes = graph_->cindexes.size(); std::vector::const_iterator iter = cindex_ids.begin(), - end = cindex_ids.end(); - for (; iter != end; ++iter) { + end = cindex_ids.end(); + std::vector::iterator out_iter = cindexes->begin(); + for (; iter != end; ++iter, ++out_iter) { int32 cindex_id = *iter; - const Cindex &cindex = graph.cindexes[cindex_id]; - if (nnet.IsComponentNode(cindex.first)) { - KALDI_ASSERT(!graph.is_input[cindex_id]); - cindexes->push_back(cindex); - } + KALDI_ASSERT(static_cast(cindex_id) < num_cindexes); + *out_iter = graph_->cindexes[cindex_id]; } } -/// Outputs into component_steps, steps corresponding to all Cindexes that -/// correspond to Component nodes and that are not inputs to the network. (note -/// that a Cindex for a Component node that's provided as an input to the -/// network is not case we anticipate being common, but it's possible in the -/// framework). Note, a step is just a list of cindex_ids that can all be computed -/// at the same time. -static void AddComponentSteps( - const Nnet &nnet, - const ComputationGraph &graph, - const std::vector > &phases, - std::vector > *component_steps) { - int32 num_phase_indexes = phases.size(); - - std::vector cindexes; - - // We don't include phase_index = 0, because all inputs to the network - // (whether the node index is type kInput or kComponent) will be assigned to - // phase_index 0, and no non-inputs should be there (we checked this). - for (int32 phase_index = 1; phase_index < num_phase_indexes; phase_index++) { - ExtractOnlyComponentCindexes(phases[phase_index], graph, nnet, &cindexes); - - // now "cindexes" contains all Cindexes that are from Component nodes (and - // we have made sure that none of these are being provided as inputs). - // Sorting this array gives us the ordering we want, where Cindexes from - // different node-ids are separated into contiguous ranges, and within each - // range, they are sorted by Index. - std::sort(cindexes.begin(), cindexes.end()); - - std::vector::iterator iter = cindexes.begin(), end = cindexes.end(); - while (iter != end) { - // each pass through this while loop processes one batch of cindex_ids; - // each batch has a particular node-index. - std::vector::iterator cur_end = iter; - int32 this_node_id = iter->first; - while (cur_end != end && cur_end->first == this_node_id) - cur_end++; - // the range [iter, cur_end) is nonempty and contains all the same node-id. 
- int32 size = cur_end - iter; - component_steps->push_back(std::vector()); - std::vector &this_step = component_steps->back(); - this_step.resize(size); - for (int32 i = 0; i < size; i++, iter++) - this_step[i] = graph.GetCindexId(*iter); - KALDI_ASSERT(iter == cur_end); - // at this point iter will point to either the end of the "cindexes" - // vector, or the beginning of the next set of Cindexes to process. - } + +void ComputationStepsComputer::ConvertToCindexIds( + const std::vector &cindexes, + std::vector *cindex_ids) const { + cindex_ids->resize(cindexes.size()); + std::vector::const_iterator iter = cindexes.begin(), + end = cindexes.end(); + std::vector::iterator out_iter = cindex_ids->begin(); + for (; iter != end; ++iter, ++out_iter) { + int32 cindex_id = graph_->GetCindexId(*iter); + KALDI_ASSERT(cindex_id >= 0); + *out_iter = cindex_id; } } -/// You call this function after calling AddInputSteps to add steps for inputs -/// to "all_steps", then calling AddComponentSteps to output steps for -/// components to "component_steps". This function moves the component steps -/// from "component_steps" to "all_steps", while preceding each component step -/// with a corresponding step for setting up the input to that component (i.e. a -/// step for the preceding Descriptor). The reason we do it like this is (a) to -/// ensure that the step for the input to the Component, which comes from a -/// Descriptor, comes immediately before it, which is convenient; and (b) -/// because it's possible in certain rather weird setups, some Cindexes -/// corresponding to the Descriptors at the inputs of Components will end up -/// being listed in two separate steps; and if we added the input-descriptor -/// steps using the same mechanism as AddComponentSteps, we wouldn't be able to -/// correctly capture this duplication. -static void AddComponentInputSteps( - const ComputationGraph &graph, - std::vector > *component_steps, - std::vector > *all_steps) { +// static +void ComputationStepsComputer::ConvertToIndexes( + const std::vector &cindexes, + std::vector *indexes) { + indexes->resize(cindexes.size()); + std::vector::const_iterator iter = cindexes.begin(), + end = cindexes.end(); + std::vector::iterator out_iter = indexes->begin(); + for (; iter != end; ++iter, ++out_iter) + *out_iter = iter->second; +} + +// static +void ComputationStepsComputer::ConvertToCindexes( + const std::vector &indexes, + int32 node_index, + std::vector *cindexes) { + KALDI_ASSERT(node_index >= 0); + cindexes->resize(indexes.size()); + std::vector::const_iterator iter = indexes.begin(), + end = indexes.end(); + std::vector::iterator out_iter = cindexes->begin(); + for (; iter != end; ++iter, ++out_iter) { + out_iter->first = node_index; + out_iter->second = *iter; + } +} - int32 space_for_outputs = 10; // arbitrary. - all_steps->reserve(all_steps->size() + - component_steps->size() * 2 + space_for_outputs); - for (size_t i = 0; i < component_steps->size(); i++) { - std::vector &component_step = (*component_steps)[i]; - KALDI_ASSERT(!component_step.empty()); - // First make a step for the descriptor at the input of this Component. 
- unordered_set descriptor_cindex_ids; - std::vector::iterator iter = component_step.begin(), - end = component_step.end(); + +void ComputationStepsComputer::ProcessComponentStep( + const std::vector &step) { + KALDI_ASSERT(!step.empty()); + int32 component_node_index = step.front().first; + int32 component_input_index = component_node_index - 1; + KALDI_ASSERT(nnet_.IsComponentNode(component_node_index)); + const NetworkNode &node = nnet_.GetNode(component_node_index); + int32 c = node.u.component_index; + const Component *component = nnet_.GetComponent(c); + if (component->Properties() & kSimpleComponent) { + // for simple components, the input cindexes will be the same as the + // output ones except for the node index, so we do a shortcut that's + // faster (no following dependencies). + std::vector input_step(step.size()); + input_step.resize(step.size()); + std::vector::iterator iter = input_step.begin(), + end = input_step.end(); + std::vector::const_iterator src = step.begin(); + for (; iter != end; ++iter,++src) { + iter->first = component_input_index; + iter->second = src->second; + } + AddStep(input_step); + AddStep(step); + } else { + std::vector step_cindex_ids; + ConvertToCindexIds(step, &step_cindex_ids); + // to get the input cindexes we need to follow dependencies back. + unordered_set input_cindex_ids; + std::vector::iterator iter = step_cindex_ids.begin(), + end = step_cindex_ids.end(); for (; iter != end; ++iter) { int32 c = *iter; - const std::vector &dependencies = graph.dependencies[c]; + const std::vector &dependencies = graph_->dependencies[c]; std::vector::const_iterator dep_iter = dependencies.begin(), dep_end = dependencies.end(); for (; dep_iter != dep_end; ++dep_iter) { int32 d = *dep_iter; - descriptor_cindex_ids.insert(d); + input_cindex_ids.insert(d); } } // Convert to Cindexes so we can sort them as Cindexes. - std::vector descriptor_cindexes; - descriptor_cindexes.reserve(descriptor_cindex_ids.size()); - unordered_set::iterator set_iter = descriptor_cindex_ids.begin(), - set_end = descriptor_cindex_ids.end(); + std::vector input_step; + input_step.reserve(input_cindex_ids.size()); + unordered_set::iterator set_iter = input_cindex_ids.begin(), + set_end = input_cindex_ids.end(); for (; set_iter != set_end; ++set_iter) { int32 c = *set_iter; - descriptor_cindexes.push_back(graph.cindexes[c]); + input_step.push_back(graph_->cindexes[c]); } - // sort the cindexes. - std::sort(descriptor_cindexes.begin(), descriptor_cindexes.end()); - - // We technically allow a Component with no input, e.g. in case where for - // some reason it decides it has no dependencies, e.g. it has a constant - // output. In this case we create an empty step, to preserve the property - // that the step for the Component's input comes immediately before the step - // for the Component itself. - if (!descriptor_cindexes.empty()) { - // Make sure all these cindexes come from the same node_id, which should - // be the one immediately preceding the Component node_id of - // "component_step". - int32 node_id = descriptor_cindexes.front().first; - KALDI_ASSERT(descriptor_cindexes.back().first == node_id && - graph.cindexes[component_step.front()].first == node_id + 1); - } - // Now that we've sorted, convert back to cindex_ids (this list will be - // the "step"). 
- int32 size = descriptor_cindexes.size(); - std::vector descriptor_step(size); - for (int32 i = 0; i < size; i++) { - descriptor_step[i] = graph.GetCindexId(descriptor_cindexes[i]); - KALDI_ASSERT(descriptor_step[i] != -1); + // sort the input cindexes. + std::sort(input_step.begin(), input_step.end()); + + if (component->Properties() & kReordersIndexes) { + std::vector indexes, input_indexes; + ConvertToIndexes(input_step, &input_indexes); + ConvertToIndexes(step, &indexes); + + // the component wants to have the opportunity to change the + // order of these indexes from their default. + component->ReorderIndexes(&input_indexes, &indexes); + + // Now convert back from indexes to cindexes (we know the + // node-index in each case) + std::vector reordered_step; + ConvertToCindexes(indexes, component_node_index, &reordered_step); + ConvertToCindexes(input_indexes, component_input_index, &input_step); + AddStep(input_step); + AddStep(reordered_step); + } else { + AddStep(input_step); + // it's more efficient to add the step with cindex_ids; and we have these + // available, so we do it that way. (in the other branch where + // the flag kReordersIndexes was present, we couldn't do this because + // of the reordering). + AddStep(&step_cindex_ids); } - // efficiently add descriptor_step to the end of all_steps. - all_steps->push_back(std::vector()); - all_steps->back().swap(descriptor_step); - - // efficiently add component_step to the end of all_steps (this destroys the - // input, which we won't be needing any more). - all_steps->push_back(std::vector()); - all_steps->back().swap(component_step); } - component_steps->clear(); } -static void CreateCindexIdToStep( - const ComputationGraph &graph, - const std::vector > &all_steps, - std::vector *cindex_id_to_step) { - int32 num_cindex_ids = graph.cindexes.size(); - cindex_id_to_step->clear(); - cindex_id_to_step->resize(num_cindex_ids, -1); - int32 num_steps = all_steps.size(); - for (int32 step = 0; step < num_steps; step++) { - std::vector::const_iterator iter = all_steps[step].begin(), - end = all_steps[step].end(); - for (; iter != end; ++iter) { - int32 cindex_id = *iter; - (*cindex_id_to_step)[cindex_id] = step; - } +void ComputationStepsComputer::ConvertToLocations( + const std::vector &cindex_ids, + std::vector > *locations) const { + locations->resize(cindex_ids.size()); + std::vector::const_iterator iter = cindex_ids.begin(), + end = cindex_ids.end(); + std::vector >::iterator out_iter = + locations->begin(); + // note, locations_ and locations are different variables. + std::pair *locations_ptr = &((*locations_)[0]); + size_t num_cindexes = locations_->size(); + for (; iter != end; ++iter, ++out_iter) { + int32 cindex_id = *iter; + KALDI_ASSERT(static_cast(cindex_id) < num_cindexes); + int32 step = locations_ptr[cindex_id].first, + row = locations_ptr[cindex_id].second; + KALDI_ASSERT(step >= 0); + out_iter->first = step; + out_iter->second = row; } } -/// This function inserts into "all_steps", which at this point should contain -/// all but the output steps, steps corresponding to any nodes of type kDimRange. -/// "graph" is non-const as there are situations in which we might need to -/// add cindexes for nodes of type kDimRange. 
-static void AddDimRangeSteps( - const Nnet &nnet, - ComputationGraph *graph, - std::vector > *all_steps) { - int32 num_nodes = nnet.NumNodes(); - bool dim_range_node_exists = false; - std::vector is_dim_range_node(num_nodes, '\0'); - for (int32 n = 0; n < num_nodes; n++) { - if (nnet.IsDimRangeNode(n)) { - is_dim_range_node[n] = (char)1; - dim_range_node_exists = true; +void ComputationStepsComputer::ProcessDimRangeSubPhase( + const std::vector &sub_phase) { + int32 dim_range_node = sub_phase[0].first; + KALDI_ASSERT(nnet_.IsDimRangeNode(dim_range_node)); + const NetworkNode &node = nnet_.GetNode(dim_range_node); + // 'input_node_index' is the node index of the component or input node + // that this dim-range node gets its input from. + int32 input_node_index = node.u.node_index; + // input_cindexes will give us the cindexes of the component or input node + // that is the input to this dim-range node + std::vector input_cindexes(sub_phase); + for (std::vector::iterator iter = input_cindexes.begin(), + end = input_cindexes.end(); iter != end; ++iter) + iter->first = input_node_index; + std::vector input_cindex_ids; + ConvertToCindexIds(input_cindexes, &input_cindex_ids); + std::vector > locations; + ConvertToLocations(input_cindex_ids, &locations); + std::sort(locations.begin(), locations.end()); + KALDI_ASSERT(!locations.empty()); + std::vector >::const_iterator + locations_iter = locations.begin(), + locations_end = locations.end(); + // Each unique .first number in locations (i.e. each source step, and they + // will all correspond to component-output or input steps) will generate one + // 'step' of type kDimRange. Because dim-range nodes must be contiguous + // ranges of a source step (since they are represented as sub-matrices), for + // each source step we work out the first and last row-index (i.e. first and + // last .second member of locations) and use that to reconstruct the range. + + // each element of 'steps' will be (source_step, (begin_row, end_row)) so that + // the source of the dim-range node is indexes begin_row ... end_row-1 in that + // source step. + std::vector > > steps; + + int32 cur_source_step = locations_iter->first, + cur_row_begin = locations_iter->second, + cur_row_end = cur_row_begin + 1; + while (1) { + ++locations_iter; + if (locations_iter == locations_end || + locations_iter->first != cur_source_step) { + // we reached the end of a run of the same step. + std::pair > this_step; + this_step.first = cur_source_step; + this_step.second.first = cur_row_begin; + this_step.second.second = cur_row_end; + steps.push_back(this_step); + if (locations_iter != locations_end) { + cur_source_step = locations_iter->first; + cur_row_begin = locations_iter->second; + cur_row_end = cur_row_begin + 1; + } else { + break; + } + } else { + cur_row_end = locations_iter->second + 1; } } - if (!dim_range_node_exists) - return; - std::vector cindex_id_to_step; - CreateCindexIdToStep(*graph, *all_steps, &cindex_id_to_step); - int32 num_steps = all_steps->size(); - - // We are going to insert steps for nodes of type kDimRange just after the - // kInput or kComponent steps that the kDimRange nodes refer to. - // new_nodes_per_step will be a list of any nodes of type kDimRange that - // have input corresponding to something in that step. 
- std::vector > new_nodes_per_step(num_steps); - int32 num_cindex_ids = graph->cindexes.size(); - std::vector::const_iterator iter = graph->cindexes.begin(); - for (int32 i = 0; i < num_cindex_ids; i++,iter++) { - const Cindex &cindex = *iter; - int32 node_index = cindex.first; - if (!is_dim_range_node[node_index]) - continue; - const NetworkNode &node = nnet.GetNode(node_index); - Cindex input_cindex(node.u.node_index, cindex.second); - int32 input_cindex_id = graph->GetCindexId(input_cindex); - KALDI_ASSERT(input_cindex_id != -1); - int32 input_step = cindex_id_to_step[input_cindex_id]; - KALDI_ASSERT(input_step != -1); - new_nodes_per_step[input_step].insert(node_index); - } - int32 num_new_steps = 0, space_for_output = 10; - for (int32 step = 0; step < num_steps; step++) - num_new_steps += new_nodes_per_step[step].size(); - - // we'll later swap all_steps_out with all_steps. - std::vector > all_steps_out; - all_steps_out.reserve(num_steps + num_new_steps + space_for_output); - for (int32 step = 0; step < num_steps; step++) { - std::vector &this_step = (*all_steps)[step]; - int32 cur_out_index = all_steps_out.size(); - all_steps_out.push_back(std::vector()); // make space for this step. - std::set::iterator iter = new_nodes_per_step[step].begin(), - end = new_nodes_per_step[step].end(); - for (; iter != end; ++iter) { - int32 node = *iter, size = this_step.size(); - std::vector new_step(size); - for (int32 i = 0; i < size; i++) { - int32 cindex_id = this_step[i]; - Cindex dimrange_cindex(node, graph->cindexes[cindex_id].second); - bool input = false, is_new; - int32 dimrange_cindex_id = graph->GetCindexId(dimrange_cindex, - input, &is_new); - new_step[i] = dimrange_cindex_id; - if (is_new) { // if we newly added this cindex_id, note the dependency - // on its input. - graph->dependencies[dimrange_cindex_id].push_back(cindex_id); - } - } - all_steps_out.push_back(std::vector()); - all_steps_out.back().swap(new_step); - } - all_steps_out[cur_out_index].swap(this_step); + for (size_t i = 0; i < steps.size(); i++) { + // iterating over different source steps, although normally + // there will be just one. + int32 source_step = steps[i].first, + row_begin = steps[i].second.first, + row_end = steps[i].second.second; + // 'source' is just the elements of the source step that we're consuming. + std::vector source((*steps_)[source_step].begin() + row_begin, + (*steps_)[source_step].begin() + row_end); + std::vector cindexes; + ConvertToCindexes(source, &cindexes); + std::vector::iterator iter = cindexes.begin(), + end = cindexes.end(); + for (; iter != end; ++iter) + iter->first = dim_range_node; + bool add_if_absent = true; + // this add_if_absent says, even if cindexes were not in the graph, + // add them. This is possible in principle; it's to satisfy the + // requirement that DimRangeNodes be implemented as contiguous ranges + // of rows of component nodes or input nodes. 
+ AddStep(cindexes, add_if_absent); } - all_steps->swap(all_steps_out); } +void ComputationStepsComputer::ProcessSubPhase( + const ComputationRequest &request, + const std::vector &sub_phase) { + KALDI_ASSERT(!sub_phase.empty()); + int32 node_index = sub_phase[0].first; + KALDI_ASSERT(sub_phase.back().first == node_index); + if (nnet_.IsComponentNode(node_index)) { + ProcessComponentStep(sub_phase); + } else if (nnet_.IsInputNode(node_index)) { + ProcessInputOrOutputStep(request, false, sub_phase); + } else if (nnet_.IsOutputNode(node_index)) { + ProcessInputOrOutputStep(request, true, sub_phase); + } else if (nnet_.IsDimRangeNode(node_index)) { + // this might turn out to be multiple steps, see the code. + ProcessDimRangeSubPhase(sub_phase); + } else if (nnet_.IsComponentInputNode(node_index)) { + // We actually do nothing with these sub-phases, because they are processed + // when we process the associated component's sub-phase/step. Doing it this + // way resolves certain problems. + return; + } else { + KALDI_ERR << "Unknown node type."; + } +} -/// This function would not be necessary if we had not added the ReorderIndexes -/// function to class Component. It is responsible for possibly modifying the -/// order of the inputs and outputs of non-simple Components, and also possibly -/// removing some inputs if the Component has decided it doesn't need them. It -/// may be a while before this is ever used for something. An example use is -/// that maybe in convolutional nets or simple models, some components may want, -/// efficiency or convenience, a certain ordering of the input that differs from -/// the normal order. -void ReorderIndexes(const Nnet &nnet, - const ComputationRequest &request, - const ComputationGraph &graph, - std::vector > *steps) { - - for (int32 step = 0; step < steps->size(); step++) { - std::vector &cindex_ids = (*steps)[step]; - if (cindex_ids.empty()) continue; - int32 cindex_id = cindex_ids.front(); - int32 node_index = graph.cindexes[cindex_id].first; - const NetworkNode &node = nnet.GetNode(node_index); - if (node.node_type != kComponent || - graph.is_input[cindex_id]) - continue; // nothing to do if an input, or if not a Component. - - int32 c = node.u.component_index; - const Component *component = nnet.GetComponent(c); - if (!(component->Properties() & kReordersIndexes)) - continue; // nothing to do if it doesn't modify indexes. - KALDI_ASSERT(step > 0); // or should have continued already. - - // preceding step will be Cindexes from the input Descriptor. - std::vector &input_cindex_ids = (*steps)[step - 1]; - - int32 size = cindex_ids.size(), input_size = input_cindex_ids.size(); - std::vector indexes(size), input_indexes(input_size); - - for (int32 i = 0; i < size; i++) - indexes[i] = graph.cindexes[cindex_ids[i]].second; - for (int32 i = 0; i < input_size; i++) - input_indexes[i] = graph.cindexes[input_cindex_ids[i]].second; - - component->ReorderIndexes(&input_indexes, &indexes); - // size should not change. 
- KALDI_ASSERT(input_indexes.size() == input_size && indexes.size() == size); - - if (size > 0) { - int32 node_index = graph.cindexes[cindex_ids.front()].first; - for (int32 i = 0; i < size; i++) { - Cindex cindex(node_index, indexes[i]); - cindex_ids[i] = graph.GetCindexId(cindex); - } - } - if (input_size > 0) { - int32 input_node_index = graph.cindexes[input_cindex_ids.front()].first; - for (int32 i = 0; i < input_size; i++) { - Cindex cindex(input_node_index, input_indexes[i]); - input_cindex_ids[i] = graph.GetCindexId(cindex); - } - } - // note: cindex_ids and input_cindex_ids are references, so we have - // changed *steps by writing to them in the above two loops. +void ComputationStepsComputer::Check() const { + int32 num_cindexes = graph_->cindexes.size(); + KALDI_ASSERT(locations_->size() == num_cindexes); + for (int32 c = 0; c < num_cindexes; c++) { + int32 step = (*locations_)[c].first, + row = (*locations_)[c].second; + KALDI_ASSERT(step >= 0 && row >= 0 && + (*steps_)[step][row] == c); } } -} // namespace compute_computation_steps. - -void ComputeComputationSteps( - const Nnet &nnet, - const ComputationRequest &request, - const std::vector > &phases, - ComputationGraph *graph, - std::vector > *steps) { - using namespace compute_computation_steps; - AddInputSteps(nnet, request, *graph, steps); - { - std::vector > component_steps; - AddComponentSteps(nnet, *graph, phases, &component_steps); - AddComponentInputSteps(*graph, &component_steps, steps); - } - // output steps don't get reordered so we do the reordering before adding - // them. - ReorderIndexes(nnet, request, *graph, steps); - AddDimRangeSteps(nnet, graph, steps); - AddOutputSteps(nnet, request, *graph, steps); +void ComputationStepsComputer::SplitIntoSubPhases( + const std::vector &phase, + std::vector > *sub_phases) const { + std::vector phase_cindexes; + ConvertToCindexes(phase, &phase_cindexes); + KALDI_ASSERT(!phase_cindexes.empty()); + std::sort(phase_cindexes.begin(), phase_cindexes.end()); + // 'sub_phase_begins' is the indexes onto 'phase_cindees' that + // start a run of the same node-index + std::vector segment_begins; + int32 cur_node_index = -1; + size_t size = phase_cindexes.size(); + for (size_t i = 0; i < size; i++) { + if (phase_cindexes[i].first != cur_node_index) { + cur_node_index = phase_cindexes[i].first; + segment_begins.push_back(i); + } + } + size_t num_sub_phases = segment_begins.size(); + segment_begins.push_back(size); + sub_phases->clear(); + sub_phases->resize(num_sub_phases); + for (size_t i = 0; i < num_sub_phases; i++) { + size_t this_begin = segment_begins[i], + this_end = segment_begins[i+1]; + (*sub_phases)[i].insert((*sub_phases)[i].end(), + phase_cindexes.begin() + this_begin, + phase_cindexes.begin() + this_end); + } } diff --git a/src/nnet3/nnet-computation-graph.h b/src/nnet3/nnet-computation-graph.h index 41087123421..863add7fd2d 100644 --- a/src/nnet3/nnet-computation-graph.h +++ b/src/nnet3/nnet-computation-graph.h @@ -83,7 +83,7 @@ struct ComputationGraph { /// the corresponding "is_input" flag set to the value "input") and set /// *is_new to true. If present, set is_new to false and return the existing /// cindex_id. - int32 GetCindexId(const Cindex &cindex, bool input, bool *is_new); + int32 GetCindexId(const Cindex &cindex, bool is_input, bool *is_new); /// Const version of GetCindexId that does not add CindexIds. It will return /// -1 if the Cindex is not present, and the user should check for this. 
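A small usage sketch (illustrative only, not part of the diff) contrasting the
two GetCindexId() overloads, mirroring how ComputationStepsComputer::AddStep()
above uses them:

    // Const lookup: never modifies the graph; returns -1 if the cindex is absent.
    int32 cindex_id = graph->GetCindexId(cindex);
    KALDI_ASSERT(cindex_id >= 0);

    // Adding lookup: inserts the cindex if it is not already present.  In this
    // patch that path is taken only when adding rows for kDimRange nodes.
    bool is_input = false, added;
    int32 maybe_new_id = graph->GetCindexId(cindex, is_input, &added);
    if (added) {
      // a brand-new cindex_id was appended to graph->cindexes, so arrays indexed
      // by cindex_id (such as 'locations') need to be resized accordingly.
    }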
@@ -243,8 +243,10 @@ class ComputationGraphBuilder { std::vector *required) const; // this function, to be called from Compute(), does some sanity checks to - // verify that the internal state is consistent. - void Check() const; + // verify that the internal state is consistent. It only does this for the + // current 'segment' of the computation, starting from 'start_cindex_id' (this + // will be 0 in normal, single-segment computations). + void Check(int32 start_cindex_id) const; const Nnet &nnet_; const ComputationRequest *request_; @@ -377,7 +379,7 @@ void ComputeComputationPhases( /** - This function arranges the cindex_ids of the computation into a sequence of + This class arranges the cindex_ids of the computation into a sequence of lists called "steps", which will correspond roughly to the commands in the compiled computation. The steps are finer than phases. (See \ref dnn3_compile_steps for more info). To summarize the properties that @@ -386,30 +388,157 @@ void ComputeComputationPhases( - All cindex_ids within a given step correspond to the same node in the graph - All dependencies of cindex_ids within a given step have been computed in earlier steps. + - All cindex_ids within a given step share the same location when + computed (i.e. a matrix or submatix) There are also some extra, more obscure properties that the sequence of steps must satisfy: - - Any input or output in the ComputationRequest must be in one step, with the - Indexes in the same order as specified in the ComputationRequest. (Note: - inputs can be for nodes of type kComponent as well as kInput). + + - Any input or output specified in a ComputationRequest must be in one step, + with the Indexes in the same order as specified in the ComputationRequest. + (Note: inputs can be for nodes of type kComponent as well as kInput). - If a step corresponds to a node of type kComponent (and does not correspond to an input in the ComputationRequest), then the immediately preceding step must correspond to a node of type kDescriptor, and the sequence of Indexes in the two steps must be identical. - If a step corresponds to a node of type kDimRange, then there must be - another step corresponding to the source node, with exactly the same + a preceding step corresponding to the source node, with exactly the same Indexes appearing in the same order. (This lets us use a sub-matrix for - the kDimRange node). + the kDimRange node). We guarantee this by adding extra cindexes to the + kDimRange steps as needed. The reason why computation_graph is not provided as a const argument is that in order to ensure the final property we may have to add a few new cindex_ids. */ -void ComputeComputationSteps( - const Nnet &nnet, - const ComputationRequest &request, - const std::vector > &phases, - ComputationGraph *computation_graph, - std::vector > *steps); + +class ComputationStepsComputer { + public: + /// Constructor. + /// @param [in] nnet The neural network that this computation is for. + /// @param [in,out] graph The computation graph that we're computing the steps + /// for. It's only non-const because in certain + /// unusual cases relating to nodes of type kDimRange, + /// we may need to add new cindexes. + /// @param [out] steps The main output of this class, which is + /// a sequence of steps, each step being an ordered list of cindex_ids. + /// It just gets cleared in the constructor; it's set up + /// when you call ComputeForSegment(). 
+ /// @param [out] locations The additional output of this class, which is a function + /// of the information in 'steps'. The array + /// 'locations' is indexed by cindex_id, and each one is a pair + /// (step-index, index-into-step), so that for any cindex_id c, + /// (*steps)[locations[c].first][locations[c].second] == c. + /// It's possible in principle if there are non-simple + /// Components, that for node corresponding to component-input + /// descriptors, a cindex might be present in more than one step, + /// so it doesn't follow that if (*steps)[i][j] == c, then + /// locations[c] == (i,j). + ComputationStepsComputer(const Nnet &nnet, + ComputationGraph *graph, + std::vector > *steps, + std::vector > *locations); + + /// You call this once for each segment, in order (note: for normal, + /// non-online computations, there is only one segment). + void ComputeForSegment(const ComputationRequest &request, + const std::vector > &phases); + + /// This is only to be called after you have called ComputeForSegment + /// for all the segments. + void Check() const; + private: + + // Adds step(s) for one "sub-phase". A sub-phase is the set of cindex_ids from + // one phase that have the same node index. Note: for nodes that are + // component-input descriptors, we don't actually create the step here, we + // create it just before creating the step for its component, and we recreate + // the list of cindexes from those from the component. The reason is that + // there are situations where doing it directly from the raw_step would not do + // the right thing (especially with non-simple components, it's possible that + // the cindexes component-input descriptors could be used twice by two + // different components).. + void ProcessSubPhase(const ComputationRequest &request, + const std::vector &sub_phase); + + // Called from ProcessSubPhase- for the case where it's a DimRangeNode. + void ProcessDimRangeSubPhase(const std::vector &sub_phase); + + // Called from ProcessSubPhase- for the case where it's an input or output node. + void ProcessInputOrOutputStep(const ComputationRequest &request, + bool is_output, + const std::vector &sub_phase); + + // Called from ProcessSubPhase- for the case where it's a component node. + void ProcessComponentStep(const std::vector &step); + + + // Splits a phase up into multiple "sub-phases", which are just the cindexes + // from a phase that are from a single node, sorted. At this point we + // represent them as Cindexes, not cindex_ids. For efficiency and because it + // would be discarded anyway, it discards any raw steps that correspond to + // component-input descriptors because these are not processed inside + // ProcessSubPhase(). + void SplitIntoSubPhases(const std::vector &phase, + std::vector > *sub_phase) const; + + // This low-level function used by functions like ProcessComponentStep, + // ProcessInputStep and so on, adds one step to 'steps_' (converting from + // Cindex to cindex_ids), and updates 'locations' appropriately. It returns + // the step index that we just added (== size of steps_ at entry). + // If you specify add_if_absent = true, it will add any Cindexes that were + // not already present, to the graph. [this option is only to be used + // in processing dim-range nodes. + int32 AddStep(const std::vector &cindexes, + bool add_if_absent = false); + + // This is an alternative interface to AddStep() that takes a list of + // cindex_ids instead of cindexes (it's destructive of that list). 
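For orientation, a minimal driver for this class, following the contract described in the comments above (a hypothetical sketch, not code from the patch):

    std::vector<std::vector<int32> > steps;
    std::vector<std::pair<int32, int32> > locations;
    ComputationStepsComputer steps_computer(nnet, &graph, &steps, &locations);
    steps_computer.ComputeForSegment(request, phases);  // once per segment, in order
    steps_computer.Check();
    // Afterwards, for any cindex_id c:
    //   steps[locations[c].first][locations[c].second] == c.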
+ int32 AddStep(std::vector *cindex_ids); + + + // This utility function uses graph_ to convert a vector of cindex_ids into + // Cindexes. + void ConvertToCindexes(const std::vector &cindex_ids, + std::vector *cindexes) const; + + // Converts a vector of Cindexes to a vector of Indexes, by + // stripping out the node index. + static void ConvertToIndexes(const std::vector &cindexes, + std::vector *indexes); + + // Converts a vector of Indexes to Cindexes, using a supplied + // node index. + static void ConvertToCindexes(const std::vector &indexes, + int32 node_index, + std::vector *cindexes); + + + // This utility function uses graph_ to convert a vector of cindex_ids into + // Cindexes. It will crash if the cindexes were not present in the graph. + void ConvertToCindexIds(const std::vector &cindexes, + std::vector *cindex_ids) const; + + // This utility function uses the 'locations_' array to convert the cindex_ids + // in 'cindex_ids' into an array (of the same length) of locations, i.e. of + // pairs (step, index-into-step), so that if cindex_ids[i] = c, then + // (*locations)[i] will be set to (*locations_)[c]. It will die if + // one of the locations was not defined, i.e. was the pair (-1, -1). + void ConvertToLocations( + const std::vector &cindex_ids, + std::vector > *locations) const; + + + const Nnet &nnet_; + ComputationGraph *graph_; + /// steps_ is a pointer to an output that's passed in in the constructor. + std::vector > *steps_; + /// locations_ is a map from cindex_id to the pair of indexes into steps_ where + /// that cindex_id resides, so if (*locations_)[c] = (i,j), then + /// (*steps_)[i][j] == c. This is also an output (we get the pointer in + /// the constructor). + std::vector > *locations_; +}; + } // namespace nnet3 diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 37ce355c788..95220b9aae2 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1019,6 +1019,8 @@ void GenerateConfigSequence( GenerateConfigSequenceCnn(opts, configs); break; case 8: + if (!opts.allow_use_of_x_dim) + goto start; GenerateConfigSequenceDistribute(opts, configs); break; case 9: diff --git a/src/nnet3/nnet-test-utils.h b/src/nnet3/nnet-test-utils.h index 18e4960f9bd..d2034dcfdc6 100644 --- a/src/nnet3/nnet-test-utils.h +++ b/src/nnet3/nnet-test-utils.h @@ -38,6 +38,7 @@ struct NnetGenerationOptions { bool allow_multiple_inputs; bool allow_multiple_outputs; bool allow_final_nonlinearity; + bool allow_use_of_x_dim; // if set to a value >0, the output-dim of the network // will be set to this value. int32 output_dim; @@ -50,6 +51,7 @@ struct NnetGenerationOptions { allow_multiple_inputs(true), allow_multiple_outputs(false), allow_final_nonlinearity(true), + allow_use_of_x_dim(true), output_dim(-1) { } }; From 6d0d9ff3928a3295d3000978e82a76dc681faa63 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 15 Oct 2016 19:27:32 -0400 Subject: [PATCH 011/213] Some minor refactoring to make online computation easier (remove unused/unnecessary 'request' args for optimization). 
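Concretely, this refactoring changes optimization call sites to the following shape (a sketch of the post-patch form, as seen in the hunks below):

    NnetOptimizeOptions opt_config;
    if (GetVerboseLevel() >= 4)
      CheckComputation(nnet, computation, true);  // pre-optimization check; 'request' arg removed
    Optimize(opt_config, nnet, &computation);     // previously: Optimize(opt_config, nnet, request, &computation)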
--- src/nnet3/nnet-analyze.cc | 1 - src/nnet3/nnet-analyze.h | 3 +-- src/nnet3/nnet-compute-test.cc | 2 +- src/nnet3/nnet-derivative-test.cc | 4 ++-- src/nnet3/nnet-optimize-test.cc | 2 +- src/nnet3/nnet-optimize-utils.cc | 3 +-- src/nnet3/nnet-optimize-utils.h | 2 -- src/nnet3/nnet-optimize.cc | 36 +++++++++++++++---------------- src/nnet3/nnet-optimize.h | 3 --- 9 files changed, 24 insertions(+), 32 deletions(-) diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 2176837a7d9..896be0e437c 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -982,7 +982,6 @@ void ComputationChecker::CheckComputationDebugInfo() const { } void CheckComputation(const Nnet &nnet, - const ComputationRequest &request, const NnetComputation &computation, bool check_rewrite) { CheckComputationOptions opts; diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 8b02d6376e9..c1911d36457 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -417,9 +417,8 @@ class ComputationChecker { /// This is a convenience interface for class ComputationChecker. Call it with -/// check_rewrite = true only if the optimization is pre-optimization. +/// check_rewrite = true only if the computation is pre-optimization. void CheckComputation(const Nnet &nnet, - const ComputationRequest &request, const NnetComputation &computation, bool check_rewrite = false); diff --git a/src/nnet3/nnet-compute-test.cc b/src/nnet3/nnet-compute-test.cc index afe7da86dc1..c485cc06636 100644 --- a/src/nnet3/nnet-compute-test.cc +++ b/src/nnet3/nnet-compute-test.cc @@ -119,7 +119,7 @@ void UnitTestNnetCompute() { if (RandInt(0, 1) == 0) { NnetOptimizeOptions opt_config; - Optimize(opt_config, nnet, request, &computation); + Optimize(opt_config, nnet, &computation); { std::ostringstream os; computation.Print(os, nnet); diff --git a/src/nnet3/nnet-derivative-test.cc b/src/nnet3/nnet-derivative-test.cc index 0f5f2f6d54a..511a6dc6bf9 100644 --- a/src/nnet3/nnet-derivative-test.cc +++ b/src/nnet3/nnet-derivative-test.cc @@ -139,7 +139,7 @@ void UnitTestNnetModelDerivatives() { if (limit_deriv_times) SetDerivTimesOptions(request, &opt_config); - Optimize(opt_config, nnet, request, &computation); + Optimize(opt_config, nnet, &computation); std::ostringstream os; computation.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); @@ -303,7 +303,7 @@ void UnitTestNnetInputDerivatives() { if (RandInt(0, 3) != 0 && allow_optimization) { NnetOptimizeOptions opt_config; // opt_config.initialize_undefined = false; // temp - Optimize(opt_config, nnet, request, &computation); + Optimize(opt_config, nnet, &computation); std::ostringstream os; computation.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index 97662acc556..7b64d67b72c 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -71,7 +71,7 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { NnetComputation computation_opt(computation); { - Optimize(opt_config, nnet, request, &computation_opt); + Optimize(opt_config, nnet, &computation_opt); std::ostringstream os; computation_opt.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index b2f171a0670..eaf026b7740 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -563,9 +563,8 @@ 
void RemoveNoOps(NnetComputation *computation) { VariableMergingOptimizer::VariableMergingOptimizer( const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation): - config_(config), nnet_(nnet), request_(request), + config_(config), nnet_(nnet), computation_(computation), already_called_merge_variables_(false) { analyzer_.Init(nnet, *computation); diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 9b3b640d817..b8957abf6fa 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -131,7 +131,6 @@ class VariableMergingOptimizer { public: VariableMergingOptimizer(const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation); // Note: you can call this only once. If it returns true, it means it has // merged variables. In this case, you have the option to instantiate another @@ -168,7 +167,6 @@ class VariableMergingOptimizer { const NnetOptimizeOptions &config_; const Nnet &nnet_; - const ComputationRequest &request_; NnetComputation *computation_; Analyzer analyzer_; diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index f2759d7705d..4cb18d26ea4 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -341,12 +341,11 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, void VariableMergingOptimization(const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation) { bool changed = true; while (changed) { changed = false; - VariableMergingOptimizer opt(config, nnet, request, computation); + VariableMergingOptimizer opt(config, nnet, computation); if (opt.MergeVariables()) changed = true; } @@ -355,10 +354,12 @@ void VariableMergingOptimization(const NnetOptimizeOptions &config, // This is a simplified top-level interface to the model-update consolidation // code from class ModelUpdateConsolidator. void ConsolidateModelUpdate(const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation) { - if (!request.need_model_derivative) - return; // An optimization; there would be nothing to do in this case. + // This following if-statement is an optimization: if the computation + // request(s) had need_model_derivative == false, there would be nothing to + // optimize, so don't bother trying. + if (!computation->need_model_derivative) + return; ModelUpdateConsolidator consolidator(nnet, computation); consolidator.ConsolidateModelUpdate(); } @@ -416,13 +417,12 @@ void ConvertAdditionToAssignment(const Nnet &nnet, void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation) { if (!config.optimize) return; if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, true); + CheckComputation(nnet, *computation, true); { // Call LimitDerivativeTimes(). 
// this will do nothing unless --min-deriv-time or --max-deriv-time @@ -436,44 +436,44 @@ void Optimize(const NnetOptimizeOptions &config, } if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, true); + CheckComputation(nnet, *computation, true); if (config.consolidate_model_update) - ConsolidateModelUpdate(nnet, request, computation); + ConsolidateModelUpdate(nnet, computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, true); + CheckComputation(nnet, *computation, true); if (config.convert_addition) ConvertAdditionToAssignment(nnet, computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, true); + CheckComputation(nnet, *computation, true); if (config.remove_assignments || config.backprop_in_place || config.propagate_in_place) - VariableMergingOptimization(config, nnet, request, computation); + VariableMergingOptimization(config, nnet, computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, false); + CheckComputation(nnet, *computation, false); if (config.initialize_undefined) RemoveUnnecessaryZeroing(nnet, computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, false); + CheckComputation(nnet, *computation, false); if (config.move_sizing_commands) MoveSizingCommands(nnet, computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, false); + CheckComputation(nnet, *computation, false); if (config.allocate_from_other) RemoveUnnecessaryAllocation(nnet, computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, false); + CheckComputation(nnet, *computation, false); // The following is not configurable because it is necessary for // the computation to run correctly (we do it after compilation too, @@ -482,7 +482,7 @@ void Optimize(const NnetOptimizeOptions &config, ConsolidateIoOperations(nnet, computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, false); + CheckComputation(nnet, *computation, false); } // ComputationRequests are distinguished by the names and indexes @@ -633,7 +633,7 @@ const NnetComputation* CachingOptimizingCompiler::Compile( ComputationChecker checker(check_config, nnet_, *computation); checker.Check(); } - Optimize(opt_config_, nnet_, *request, computation); + Optimize(opt_config_, nnet_, computation); if (GetVerboseLevel() >= verbose_cutoff) { std::ostringstream os; computation->Print(os, nnet_); diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index f38e4d854ff..cb5bab0d462 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -117,7 +117,6 @@ struct NnetOptimizeOptions { /// This is the top-level function for optimizing a computation. void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation); // Hash function for ComputationRequest. It converts @@ -265,7 +264,6 @@ void LimitDerivativeTimes(const Nnet &nnet, /// class ModelUpdateConsolidator. Will fail if called a /// second time. void ConsolidateModelUpdate(const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation); /// This converts addition operations (things with Add in their names) to @@ -278,7 +276,6 @@ void ConvertAdditionToAssignment(const Nnet &nnet, /// This wraps class VariableMergingOptimizer in a simplified interface. 
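As background, the CachingOptimizingCompiler whose Compile() method is touched above is driven roughly as follows by calling code (a hedged sketch; the exact constructor arguments may differ):

    CachingOptimizingCompiler compiler(nnet, opt_config);
    const NnetComputation *computation = compiler.Compile(request);
    // The returned pointer is owned by the compiler's internal cache; a later
    // request that compares equal (per the hashing described above) reuses it.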
void VariableMergingOptimization(const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation); From 7a53e666aa55f1d2c2a02e3abd17f4782bbdd72a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 25 Oct 2016 20:37:31 -0400 Subject: [PATCH 012/213] Further progress [note, this is partial work, backing up. Search for TODO. --- src/matrix/compressed-matrix.h | 10 +- src/nnet3/Makefile | 4 +- src/nnet3/nnet-analyze.cc | 9 ++ src/nnet3/nnet-analyze.h | 8 ++ src/nnet3/nnet-common.cc | 21 +++- src/nnet3/nnet-common.h | 5 + src/nnet3/nnet-compile-test.cc | 60 +++++++++- src/nnet3/nnet-compile.h | 2 +- src/nnet3/nnet-computation-graph.cc | 2 +- src/nnet3/nnet-computation.cc | 2 +- src/nnet3/nnet-example-utils.h | 2 +- src/nnet3/nnet-nnet.h | 20 ++-- src/nnet3/nnet-optimize-utils.cc | 175 ++++++++++++++++++++++------ src/nnet3/nnet-optimize-utils.h | 82 ++----------- src/nnet3/nnet-optimize.h | 4 +- src/nnet3/nnet-test-utils.cc | 10 +- src/nnet3/nnet-test-utils.h | 2 + src/nnet3/nnet-utils.cc | 115 +++++++++++++----- src/nnet3/nnet-utils.h | 4 +- 19 files changed, 371 insertions(+), 166 deletions(-) diff --git a/src/matrix/compressed-matrix.h b/src/matrix/compressed-matrix.h index 603134ab800..4e4238c43da 100644 --- a/src/matrix/compressed-matrix.h +++ b/src/matrix/compressed-matrix.h @@ -47,7 +47,7 @@ class CompressedMatrix { CompressedMatrix(): data_(NULL) { } ~CompressedMatrix() { Clear(); } - + template CompressedMatrix(const MatrixBase &mat): data_(NULL) { CopyFromMat(mat); } @@ -73,7 +73,7 @@ class CompressedMatrix { template CompressedMatrix &operator = (const MatrixBase &mat); // assignment operator. - + /// Copies contents to matrix. Note: mat must have the correct size. /// kNoTrans case uses a temporary. template @@ -81,7 +81,7 @@ class CompressedMatrix { MatrixTransposeType trans = kNoTrans) const; void Write(std::ostream &os, bool binary) const; - + void Read(std::istream &is, bool binary); /// Returns number of rows (or zero for emtpy matrix). @@ -113,7 +113,7 @@ class CompressedMatrix { void Swap(CompressedMatrix *other) { std::swap(data_, other->data_); } void Clear(); - + friend class Matrix; friend class Matrix; private: @@ -163,7 +163,7 @@ class CompressedMatrix { static inline float CharToFloat(float p0, float p25, float p75, float p100, unsigned char value); - + void *data_; // first GlobalHeader, then PerColHeader (repeated), then // the byte data for each column (repeated). Note: don't intersperse // the byte data with the PerColHeaders, because of alignment issues. 
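The compressed-matrix.h changes above are whitespace-only; for context, a typical round trip through this class looks roughly like the following (illustrative, assuming a Matrix<BaseFloat> named 'feats'):

    CompressedMatrix cmat(feats);    // lossy compression of 'feats'
    Matrix<BaseFloat> out(cmat.NumRows(), cmat.NumCols());
    cmat.CopyToMat(&out);            // decompress; 'out' must already have the right size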
diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 65384f5a338..60629ab1cbe 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -28,7 +28,7 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ discriminative-supervision.o nnet-discriminative-example.o \ nnet-discriminative-diagnostics.o \ discriminative-training.o nnet-discriminative-training.o \ - online-nnet3-decodable-simple.o + online-nnet3-decodable-simple.o nnet-compile-online.o LIBNAME = kaldi-nnet3 @@ -37,6 +37,6 @@ ADDLIBS = ../chain/kaldi-chain.a ../cudamatrix/kaldi-cudamatrix.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 896be0e437c..5513ce24a92 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -1241,5 +1241,14 @@ void Analyzer::Init(const Nnet &nnet, const NnetComputation &computation) { &matrix_accesses); } +void GetSegmentEnds(const NnetComputation &computation, + std::vector *command_indexes) { + int32 num_commands = computation.commands.size(); + command_indexes->clear(); + for (int32 c = 0; c < num_commands; c++) + if (computation.commands[c].command_type == kNoOperationMarker) + command_indexes->push_back(c); +} + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index c1911d36457..7109575e415 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -416,6 +416,14 @@ class ComputationChecker { }; +/// This utility function works out from a computation, the locations of the +/// 'segment ends'. This is useful for online compilation, where the +/// computation has multiple segments corresponding to new pieces of input data +/// to process. The implementation of the function is extremely simple; it +/// just gives you the locations of commands of type 'kNoOperationMarker'. +void GetSegmentEnds(const NnetComputation &computation, + std::vector *command_indexes); + /// This is a convenience interface for class ComputationChecker. Call it with /// check_rewrite = true only if the computation is pre-optimization. void CheckComputation(const Nnet &nnet, diff --git a/src/nnet3/nnet-common.cc b/src/nnet3/nnet-common.cc index 918055df62d..412fc71341a 100644 --- a/src/nnet3/nnet-common.cc +++ b/src/nnet3/nnet-common.cc @@ -167,7 +167,7 @@ static void WriteCindexVectorElementBinary( // this separator. os.put('|'); WriteBasicType(os, binary, node_index); - } + } if (i == 0) { // we don't need to be concerned about reserving space for character 124 // ('|') here, since (wastefully) '|' is always printed for i == 0. 
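Referring back to the GetSegmentEnds() helper added to nnet-analyze.{h,cc} above, a hypothetical caller would use it like this to locate chunk boundaries in a multi-segment (online) computation:

    std::vector<int32> segment_ends;
    GetSegmentEnds(computation, &segment_ends);
    // Each entry is the command index of a kNoOperationMarker, i.e. the
    // boundary after one chunk's worth of commands.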
@@ -280,11 +280,11 @@ void WriteCindexVector(std::ostream &os, bool binary, os.put('['); WriteBasicType(os, binary, node_index); os.put(':'); - } + } vec[i].second.Write(os, binary); if (i == size - 1) os.put(']'); - } + } } else { for (int32 i = 0; i < size; i++) WriteCindexVectorElementBinary(os, vec, i); @@ -326,7 +326,7 @@ void ReadCindexVector(std::istream &is, bool binary, (*vec)[i].first = (*vec)[i-1].first; } (*vec)[i].second.Read(is, binary); - if (i == size - 1) { + if (i == size - 1) { is >> std::ws; if (is.peek() == static_cast(']')) { is.get(); @@ -358,6 +358,19 @@ size_t CindexHasher::operator () (const Cindex &cindex) const { } +size_t CindexVectorHasher::operator () ( + const std::vector &cindex_vector) const { + // this is an arbitrarily chosen prime. + size_t kPrime = 23539, ans = 0; + std::vector::const_iterator iter = cindex_vector.begin(), + end = cindex_vector.end(); + CindexHasher cindex_hasher; + for (; iter != end; ++iter) + ans = cindex_hasher(*iter) + kPrime * ans; + return ans; +} + + std::ostream &operator << (std::ostream &ostream, const Index &index) { return ostream << '(' << index.n << ' ' << index.t << ' ' << index.x << ')'; } diff --git a/src/nnet3/nnet-common.h b/src/nnet3/nnet-common.h index f8140e62f12..e6e3abe705e 100644 --- a/src/nnet3/nnet-common.h +++ b/src/nnet3/nnet-common.h @@ -108,6 +108,11 @@ struct CindexHasher { }; +struct CindexVectorHasher { + size_t operator () (const std::vector &cindex_vector) const; +}; + + // this will only be used for pretty-printing. void PrintCindex(std::ostream &ostream, const Cindex &cindex, diff --git a/src/nnet3/nnet-compile-test.cc b/src/nnet3/nnet-compile-test.cc index c0e1b6f8b5b..da08253093a 100644 --- a/src/nnet3/nnet-compile-test.cc +++ b/src/nnet3/nnet-compile-test.cc @@ -19,6 +19,7 @@ #include "nnet3/nnet-nnet.h" #include "nnet3/nnet-compile.h" +#include "nnet3/nnet-compile-online.h" #include "nnet3/nnet-test-utils.h" namespace kaldi { @@ -115,17 +116,74 @@ void UnitTestNnetCompileMulti() { } + +void UnitTestNnetCompileOnline() { + for (int32 n = 0; n < 20; n++) { + struct NnetGenerationOptions gen_config; + gen_config.allow_ivector = true; + + std::vector configs; + GenerateConfigSequence(gen_config, &configs); + Nnet nnet; + for (size_t j = 0; j < configs.size(); j++) { + KALDI_LOG << "Input config[" << j << "] is: " << configs[j]; + std::istringstream is(configs[j]); + nnet.ReadConfig(is); + } + + ComputationRequest request1, request2, request3; + int32 chunk_size_min = RandInt(5, 15); + int32 frame_subsampling_factor = RandInt(1, 3), + extra_left_context_begin = RandInt(0, 10), + extra_right_context = RandInt(0, 10), + num_sequences = RandInt(1, 2); + int32 chunk_size = GetChunkSize(nnet, frame_subsampling_factor, + chunk_size_min), + ivector_period = chunk_size; + + + + ModifyNnetIvectorPeriod(ivector_period, &nnet); + KALDI_LOG << "Nnet info after modifying ivector period is: " + << nnet.Info(); + CreateOnlineComputationRequestSimple( + nnet, chunk_size, frame_subsampling_factor, + ivector_period, extra_left_context_begin, extra_right_context, + num_sequences, &request1, &request2, &request3); + + KALDI_LOG << "Computation request 1 is:"; + request1.Print(std::cerr); + KALDI_LOG << "Computation request 2 is:"; + request2.Print(std::cerr); + KALDI_LOG << "Computation request 3 is:"; + request3.Print(std::cerr); + + NnetOptimizeOptions optimize_opts; + // todo: set optimize-online=true. 
+ NnetComputation computation; + CompileOnline(nnet, optimize_opts, + request1, request2, request3, + &computation); + KALDI_LOG << "Compiled online computation is "; + computation.Print(std::cerr, nnet); + } +} + + + } // namespace nnet3 } // namespace kaldi int main() { using namespace kaldi; using namespace kaldi::nnet3; - // SetVerboseLevel(2); + SetVerboseLevel(2); + UnitTestNnetCompileOnline(); UnitTestNnetCompile(); UnitTestNnetCompileMulti(); + KALDI_LOG << "Nnet tests succeeded."; return 0; diff --git a/src/nnet3/nnet-compile.h b/src/nnet3/nnet-compile.h index 36fcf84fbf1..20114206ceb 100644 --- a/src/nnet3/nnet-compile.h +++ b/src/nnet3/nnet-compile.h @@ -1,6 +1,6 @@ // nnet3/nnet-compile.h -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2015-2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // diff --git a/src/nnet3/nnet-computation-graph.cc b/src/nnet3/nnet-computation-graph.cc index 422a14bfe4c..1761dd1b775 100644 --- a/src/nnet3/nnet-computation-graph.cc +++ b/src/nnet3/nnet-computation-graph.cc @@ -956,7 +956,7 @@ void ComputationGraphBuilder::ComputeRequiredArray( end = dependencies.end(); for (; iter != end; ++iter) { int32 d = *iter; - if (!(*required)[d - start_cindex_id]){ + if (d >= start_cindex_id && !(*required)[d - start_cindex_id]){ (*required)[d - start_cindex_id] = true; queue.push_back(d); } diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index c58fb87dde4..21ab88f5f12 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -606,7 +606,7 @@ static void PrintCommand(std::ostream &os, os << "[no-op]\n"; break; case kNoOperationMarker: - os << "# begin backward commands\n"; + os << "# computation segment separator [e.g., begin backward commands]\n"; break; default: KALDI_ERR << "Un-handled command type."; diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 3e309e18915..6ebffcf1d50 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -51,7 +51,7 @@ void ShiftExampleTimes(int32 t_offset, /** This function takes a NnetExample (which should already have been frame-selected, if desired, and merged into a minibatch) and produces a - ComputationRequest. It ssumes you don't want the derivatives w.r.t. the + ComputationRequest. It assumes you don't want the derivatives w.r.t. the inputs; if you do, you can create/modify the ComputationRequest manually. Assumes that if need_model_derivative is true, you will be supplying derivatives w.r.t. all outputs. diff --git a/src/nnet3/nnet-nnet.h b/src/nnet3/nnet-nnet.h index 16e8333d5b1..e999f20f4f5 100644 --- a/src/nnet3/nnet-nnet.h +++ b/src/nnet3/nnet-nnet.h @@ -249,7 +249,17 @@ class Nnet { void ResetGenerators(); // resets random-number generators for all // random components. You must also set srand() for this to be // effective. - + + + // This function outputs to "config_lines" the lines of a config file. If you + // provide include_dim=false, this will enable you to reconstruct the nodes in + // the network (but not the components, which need to be written separately). + // If you provide include_dim=true, it also adds extra information about + // node dimensions which is useful for a human reader but won't be + // accepted as the config-file format. 
+ void GetConfigLines(bool include_dim, + std::vector *config_lines) const; + private: void Destroy(); @@ -261,14 +271,6 @@ class Nnet { // include dimension information that would not be provided in a config file. std::string GetAsConfigLine(int32 node_index, bool include_dim) const; - // This function outputs to "config_lines" the lines of a config file. If you - // provide include_dim=false, this will enable you to reconstruct the nodes in - // the network (but not the components, which need to be written separately). - // If you provide include_dim=true, it also adds extra information about - // node dimensions which is useful for a human reader but won't be - // accepted as the config-file format. - void GetConfigLines(bool include_dim, - std::vector *config_lines) const; // This function is used when reading config files; it exists in order to // handle replacement of existing nodes. The two input vectors have the same diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index eaf026b7740..3b7dda18e96 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -1652,23 +1652,6 @@ void DerivativeTimeLimiter::PruneMatrices() { LimitMatrices(will_limit); } - -int32 MaxOutputTimeInRequest(const ComputationRequest &request) { - int32 ans = std::numeric_limits::min(); - for (size_t i = 0; i < request.outputs.size(); i++) { - std::vector indexes &indexes = request.outputs[i].indexes; - std::vector indexes::const_iterator iter = indexes.begin(), - end = indexes.end(); - for (; iter != end; ++iter) - if (iter.t > ans) - ans = iter.t; - } - if (ans == std::numeric_limits::min()) { - KALDI_ERR << "Failed to find any output indexes in computation request."; - } - return ans; -} - void LimitDerivativeTimes(const Nnet &nnet, int32 min_deriv_time, int32 max_deriv_time, @@ -1678,33 +1661,153 @@ void LimitDerivativeTimes(const Nnet &nnet, limiter.LimitDerivTimes(); } -// This class implements the internals of the ExpandComputation() function. -class ComputationExpander { + +class ComputationOnlineOptimizer { public: - ComputationExpander(const Computation &computation, - bool need_debug_info, - int32 num_n_values, - Computation *expanded_computation): - computation_(computation), - need_debug_info_(need_debug_info), - num_n_values_(num_n_values), - expanded_computation_(expanded_computation) { } - - // This function call implements the functionality of the class, - // expanding the computation. - bool Expand(); + ComputationOnlineOptimizer(const Nnet &nnet, + NnetComputation *computation): + nnet_(nnet), computation_(computation) { } + bool Optimize(); private: - - const Computation &computation_; - bool need_debug_info_; - int32 num_n_values_; - Computation *expanded_computation_; + // This function creates a mapping from a matrix-index > 0, + // to a pair (time_offset, unique_id) that represents the debug-info + // for that matrix-id in computation.debug_info. + // The output vector is indexed by the matrix-index in the computation (the + // zeroth member is not valid). It requires that the + // The 'time_offset' is equal to the 't' value of the zeroth element of the + // cindexes vetor. The 'unique_id' is an integer that uniquely identifies + // what we get from subtracting the 'time_offset' from each 't' value of + // that 'cindexes' vector, and then pairing it up with the 'is_deriv' + // value of the DebugInfo. 
That is, if two 'cindexes' vectors differ only + // by a time offset, and the 'is_deriv' values are the same they will map to the same + // unique_id. + static void CreateMatrixPairs(const NnetComputation &computation, + std::vector > *matrix_to_pair); + + + /// Given a list of command indexes ('segment_end_commands') which are + /// expected to be command indexes of the kNoOperationMarker at segment + /// boundaries, this function outputs for each of these command indexes a list + /// of matrices which are 'active' at that point in time. By 'active' we mean + /// that the matrix has been written to before that time (note, we don't count + /// initialization with zeros as being written to); and will be read after + /// that time. These is the list of matrices that 'need to be in scope' + /// at those points in time. '*active_matrices' is indexed by the + /// same index as 'segment_end_commands', and is then a list of active + /// matrices, in numerical order of matrix index. + static void FindActiveMatrices(const NnetComputation &computation, + const std::vector &segment_end_commands, + const Analyzer &analyzer, + std::vector > *active_matrices); + + + const Nnet &nnet_; + NnetComputation *computation_; + Analyzer analyzer_; + std::vector > matrix_to_pair_; + + std::vector segment_end_commands_; }; +// static +void ComputationOnlineOptimizer::CreateMatrixPairs( + const NnetComputation &computation, + std::vector > *matrix_to_pair) { + typedef unordered_map, int32, + CindexVectorHasher> MapType; + int32 cur_vector_id = 1; + // Note: cindex_map just maps the vector to a unique value, + // and then we manually work out a unique id that takes into + // account the 'is_deriv' values. + MapType cindex_map; + int32 num_matrices = computation.matrices.size(); + matrix_to_pair->resize(num_matrices); + KALDI_ASSERT(computation.matrix_debug_info.size() == num_matrices); + for (int32 m = 1; m < num_matrices; m++) { + KALDI_ASSERT(!computation.matrix_debug_info[m].cindexes.empty()); + std::vector cindexes = computation.matrix_debug_info[m].cindexes; + int32 t_offset = cindexes[0].second.t; + for (std::vector::iterator iter = cindexes.begin(); + iter != cindexes.end(); ++iter) + iter->second.t -= t_offset; + MapType::const_iterator iter = cindex_map.find(cindexes); + int32 vector_id; + if (iter != cindex_map.end()) { + vector_id = iter->second; + } else { + vector_id = cur_vector_id++; + cindex_map[cindexes] = vector_id; + } + bool is_deriv = computation.matrix_debug_info[m].is_deriv; + int32 unique_id = 2 * vector_id + (is_deriv ? 1 : 0); + (*matrix_to_pair)[m].first = t_offset; + (*matrix_to_pair)[m].second = unique_id; + } +} + + +// static +void ComputationOnlineOptimizer::FindActiveMatrices( + const NnetComputation &computation, + const std::vector &segment_end_commands, + const Analyzer &analyzer, + std::vector > *active_matrices) { + int32 num_matrices = computation.matrices.size(); + int32 num_segments = segment_end_commands.size(); + active_matrices->clear(); + active_matrices->resize(num_segments); + // this object just makes available some extra functions. + ComputationAnalysis analysis(computation, analyzer); + for (int32 s = 0; s + 1 < num_segments; s++) { + KALDI_ASSERT(segment_end_commands[s] < segment_end_commands[s+1]); + } + // the following vector gives us, for each matrix index, a submatrix index + // that covers the whole of that matrix (needed by interface of 'analysis' object). 
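To make the mapping built by CreateMatrixPairs() above concrete, a small worked example (hypothetical values, written as comments):

    // Suppose matrix 1 has cindexes with 't' values {10, 11, 12} and matrix 2
    // has {25, 26, 27}, with the same node, n and x values and is_deriv = false
    // for both. After subtracting each matrix's first 't', both patterns become
    // {0, 1, 2}, so they map to the same vector_id (say 1) and hence the same
    // unique_id = 2 * 1 + 0 = 2:
    //   matrix_to_pair[1] = (t_offset = 10, unique_id = 2)
    //   matrix_to_pair[2] = (t_offset = 25, unique_id = 2)
    // i.e. the two matrices hold 'the same pattern', 15 frames apart.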
+ std::vector whole_submatrices; + computation.GetWholeSubmatrices(&whole_submatrices); + for (int32 m = 1; m < num_matrices; m++) { + // the following are command indexes, comparable with the indexes + // in 'segment_end_commands'. + int32 s = whole_submatrices[m]; // submatrix consisting of the whole of + // 'm'. + int32 first_access = analysis.FirstAccess(s), + last_access = analysis.LastAccess(s); + std::vector::const_iterator iter = segment_end_commands.begin(), + end = segment_end_commands.end(); + for (; iter != end; ++iter) { + int32 segment_end = *iter; + if (first_access < segment_end && last_access > segment_end) { + // TODO. + } + } + } + +} + +bool ComputationOnlineOptimizer::Optimize() { + analyzer_.Init(nnet_, *computation_); + KALDI_ASSERT(!computation_->matrix_debug_info.empty() && + "You must request matrix debug info when compiling " + "online computations."); + + // TODO. + + return false; +} + + +bool OptimizeOnlineComputation(const Nnet &nnet, + NnetComputation *computation) { + ComputationOnlineOptimizer optimizer(nnet, computation); + return optimizer.Optimize(); +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index b8957abf6fa..11a04354016 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -498,12 +498,6 @@ class DerivativeTimeLimiter { std::vector prune_info_; }; - -// This utility function, used in code that calls LimitDerivativeTimes(), returns -// the largest time 't' in any of the 'outputs' in the computation request, -// or crashes if there are no outputs (or no cindexes in those outputs). -int32 MaxOutputTimeInRequest(const ComputationRequest &request); - // This is the top-level interface to limit the times on which derivatives are // computed (e.g. for truncated BPTT); internally it uses class // DerivativeLimiter. Will do nothing if min_deriv_time and max_deriv_time are @@ -513,68 +507,6 @@ void LimitDerivativeTimes(const Nnet &nnet, int32 max_deriv_time, NnetComputation *computation); - -/** This function, used in 'shortcut' compilation where we first compile a - smaller computation with the same structure but only 2 distinct 'n' - values, works out whether a computation is 'decomposable'; if so, - it returns true and outputs the 'mini_request' with the same structure, - and the number of 'n' values. - - A computation is decomposable if the following conditions hold: - - - All of its inputs and outputs contain 'n' values for all 0 <= n < N, - for some N > 2. [we output this 'N' as 'num_n_values']. - - All of its inputs and outputs have 'regular' structure. - - What it means for an input or output (i.e. an IoSpecification) to have a - 'regular' structure, is as follows: - - The 't' and 'x' values present are the same for each 'n', - - The order in which the indexes appear is EITHER of the following: - - The 'n' varies the most rapidly, i.e. the order is: - (t1,x1,0), (t1,x1,1) ... (t1,x1,N-1) \ - (t2,x2,0), (t2,x2,1) ... (t2,x2,N-1) ... - - The 'n' varies the least rapidly, i.e. the order is: - (t1,x1,0), (t2,x2,0) ... \ - (t1,x1,1), (t2,x2,1) ... \ - ... \ - (t1,x2,N-1), (t2,x2,N-1) ... - In either case, there does not have to be any particular rhyme or - reason to the order of the t and x values, the regularity on 'n' is - all that we care about. - */ -bool ComputationIsDecomposable(const ComputationRequest &request, - ComputationRequest *mini_request, - int32 *num_n_values); // TODO: implement this. 
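Given the OptimizeOnlineComputation() wrapper defined above, a hypothetical caller in an online-compilation path would look roughly like this:

    if (OptimizeOnlineComputation(nnet, &computation)) {
      // Two equivalent segment boundaries were found, so the computation can
      // now be run indefinitely, chunk after chunk.
    } else {
      // No suitable repeat was found among the compiled segments; the caller
      // may need to recompile with more segments and try again.
    }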
- - -/** - This function is used in 'shortcut' compilation to expand a computation - that has been compiled for exactly 2 'n' values, to one that is suitable - for some num_n_values > 2. - @param [in] computation The computation that was compiled for exactly - 2 'n' values (n=0 and n=1) - @param [in] need_debug_info True if we want to retain the 'debug_info' - in the output 'expanded_computation'. In any - case, the 'debug_info' is required in the - input computation. - @param [in] num_n_values The number of 'n' values we want in the output - computation - @param [out] expanded_computation The expanded computation. - - @return This function returns true if it succeeded, and false if it - could not expand the computation for some reason (e.g. there - was some non-simple component where the 'PrecomputedIndexes' - object could not be suitably expanded. If it returns false, - the output 'expanded_computation' is undefined (may contain junk). - */ -bool ExpandComputation(const Computation &computation, - bool need_debug_info, - int32 num_n_values, - Computation *expanded_computation); - - - - /// This function detects submatrices, matrices, and members of indexes_multi /// and indexes that are never used (e.g. due to changes made in other /// optimization code), and removes them from the computation by way of suitable @@ -600,7 +532,6 @@ void IdentifySubmatrixArgs(NnetComputation::Command *command, void IdentifySubmatrixArgs(std::vector *commands, std::vector *submatrix_args); - /// This function outputs to "submatrix_args" the addresses of integers in /// 'computation' that correspond to submatrices. These may be present in /// 'commands', and in 'indexes_multi'. This is useful in renumbering code. @@ -635,7 +566,18 @@ void IdentifyIndexesArgs(std::vector *commands, void IdentifyIndexesRangesArgs(std::vector *commands, std::vector *indexes_ranges_args); - +/// This function tries to optimize computation 'computation' for an 'online' +/// computation. It expects as input a computation with no backprop but with +/// multiple 'segments' separated by command kNoOperation, where each segment +/// corresponds to a new chunk of input and output. It tries to locate a pair +/// of segment boundaries, with command indexes c1 and c2, where the active +/// matrices have the same debug-info other than a time offset and can be +/// identified with each other, and the no-op command at c2 can be replaced with +/// 'got c1', creating a computation that 'goes on forever'. +/// It returns true if it successfully did this. [If this happens, the +/// whole computation may have to be regenerated with more segments.] 
+bool OptimizeOnlineComputation(const Nnet &nnet, + NnetComputation *computation); /* diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index cb5bab0d462..ce9e4de240a 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -1,7 +1,7 @@ // nnet3/nnet-optimize.h -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) -// 2015 Xiaohui Zhang +// Copyright 2015-2016 Johns Hopkins University (author: Daniel Povey) +// 2015 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 95220b9aae2..eca7c6b2075 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -104,11 +104,15 @@ void GenerateConfigSequenceSimple( splice_context.push_back(0); int32 input_dim = 10 + Rand() % 20, - spliced_dim = input_dim * splice_context.size(), output_dim = (opts.output_dim > 0 ? opts.output_dim : 100 + Rand() % 200), hidden_dim = 40 + Rand() % 50; + int32 ivector_dim = 10 + Rand() % 20; + if (RandInt(0, 1) == 0 || !opts.allow_ivector) + ivector_dim = 0; + int32 spliced_dim = input_dim * splice_context.size() + ivector_dim; + bool use_final_nonlinearity = (opts.allow_final_nonlinearity && RandInt(0, 1) == 0); os << "component name=affine1 type=NaturalGradientAffineComponent input-dim=" @@ -127,8 +131,12 @@ void GenerateConfigSequenceSimple( } } os << "input-node name=input dim=" << input_dim << std::endl; + if (ivector_dim != 0) + os << "input-node name=ivector dim=" << ivector_dim << std::endl; os << "component-node name=affine1_node component=affine1 input=Append("; + if (ivector_dim != 0) + os << "ReplaceIndex(ivector, t, 0), "; for (size_t i = 0; i < splice_context.size(); i++) { int32 offset = splice_context[i]; os << "Offset(input, " << offset << ")"; diff --git a/src/nnet3/nnet-test-utils.h b/src/nnet3/nnet-test-utils.h index d2034dcfdc6..b6976f70ab1 100644 --- a/src/nnet3/nnet-test-utils.h +++ b/src/nnet3/nnet-test-utils.h @@ -39,6 +39,7 @@ struct NnetGenerationOptions { bool allow_multiple_outputs; bool allow_final_nonlinearity; bool allow_use_of_x_dim; + bool allow_ivector; // if set to a value >0, the output-dim of the network // will be set to this value. int32 output_dim; @@ -52,6 +53,7 @@ struct NnetGenerationOptions { allow_multiple_outputs(false), allow_final_nonlinearity(true), allow_use_of_x_dim(true), + allow_ivector(false), output_dim(-1) { } }; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 999789650b5..ed20257c7fe 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -71,10 +71,10 @@ void EvaluateComputationRequest( ComputationGraphBuilder builder(nnet, &graph); builder.Compute(request); builder.GetComputableInfo(is_computable); - if (GetVerboseLevel() >= 4) { + if (GetVerboseLevel() >= 2) { std::ostringstream graph_pretty; graph.Print(graph_pretty, nnet.GetNodeNames()); - KALDI_VLOG(4) << "Graph is " << graph_pretty.str(); + KALDI_VLOG(3) << "Graph is " << graph_pretty.str(); } } @@ -103,9 +103,16 @@ static void ComputeSimpleNnetContextForShift( input.indexes.push_back(Index(n, t)); output.indexes.push_back(Index(n, t)); } - // the assumption here is that the network just requires the ivector at time - // t=0. 
- ivector.indexes.push_back(Index(n, 0)); + + // most networks will just require the ivector at time t = 0, + // but this might not always be the case, and some might use rounding + // descriptors with the iVector which might require it at an earlier + // frame than the regular input, so we provide the iVector in as wide a range + // as it might possibly be needed. + for (int32 t = input_start - nnet.Modulus(); t < input_end; t++) { + ivector.indexes.push_back(Index(n, t)); + } + ComputationRequest request; request.inputs.push_back(input); @@ -250,6 +257,22 @@ void ZeroComponentStats(Nnet *nnet) { } } +void ScaleLearningRate(BaseFloat learning_rate_scale, + Nnet *nnet) { + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *comp = nnet->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + // For now all updatable components inherit from class UpdatableComponent. + // If that changes in future, we will change this code. + UpdatableComponent *uc = dynamic_cast(comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + "UpdatableComponent; change this code."; + uc->SetActualLearningRate(uc->LearningRate() * learning_rate_scale); + } + } +} + void SetLearningRate(BaseFloat learning_rate, Nnet *nnet) { for (int32 c = 0; c < nnet->NumComponents(); c++) { @@ -266,6 +289,63 @@ void SetLearningRate(BaseFloat learning_rate, } } +void SetLearningRates(const Vector &learning_rates, + Nnet *nnet) { + int32 i = 0; + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *comp = nnet->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + // For now all updatable components inherit from class UpdatableComponent. + // If that changes in future, we will change this code. + UpdatableComponent *uc = dynamic_cast(comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + "UpdatableComponent; change this code."; + KALDI_ASSERT(i < learning_rates.Dim()); + uc->SetActualLearningRate(learning_rates(i++)); + } + } + KALDI_ASSERT(i == learning_rates.Dim()); +} + +void GetLearningRates(const Nnet &nnet, + Vector *learning_rates) { + learning_rates->Resize(NumUpdatableComponents(nnet)); + int32 i = 0; + for (int32 c = 0; c < nnet.NumComponents(); c++) { + const Component *comp = nnet.GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + // For now all updatable components inherit from class UpdatableComponent. + // If that changes in future, we will change this code. + const UpdatableComponent *uc = dynamic_cast(comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + "UpdatableComponent; change this code."; + (*learning_rates)(i++) = uc->LearningRate(); + } + } + KALDI_ASSERT(i == learning_rates->Dim()); +} + +void ScaleNnetComponents(const Vector &scale_factors, + Nnet *nnet) { + int32 i = 0; + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *comp = nnet->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + // For now all updatable components inherit from class UpdatableComponent. + // If that changes in future, we will change this code. 
+ UpdatableComponent *uc = dynamic_cast(comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + "UpdatableComponent; change this code."; + KALDI_ASSERT(i < scale_factors.Dim()); + uc->Scale(scale_factors(i++)); + } + } + KALDI_ASSERT(i == scale_factors.Dim()); +} + void ScaleNnet(BaseFloat scale, Nnet *nnet) { if (scale == 1.0) return; else if (scale == 0.0) { @@ -615,31 +695,6 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { if (outputs_remaining == 0) KALDI_ERR << "All outputs were removed."; nnet->RemoveSomeNodes(nodes_to_remove); - } else if (directive == "set-dropout-proportion") { - std::string name_pattern = "*"; - // name_pattern defaults to '*' if none is given. This pattern - // matches names of components, not nodes. - config_line.GetValue("name", &name_pattern); - BaseFloat proportion = -1; - if (!config_line.GetValue("proportion", &proportion)) { - KALDI_ERR << "In edits-config, expected proportion to be set in line: " - << config_line.WholeLine(); - } - DropoutComponent *dropout_component = NULL; - int32 num_dropout_proportions_set = 0; - for (int32 c = 0; c < nnet->NumComponents(); c++) { - if (NameMatchesPattern(nnet->GetComponentName(c).c_str(), - name_pattern.c_str()) && - (dropout_component = - dynamic_cast(nnet->GetComponent(c)))) { - if (dropout_component != NULL) { - dropout_component->SetDropoutProportion(proportion); - num_dropout_proportions_set++; - } - } - } - KALDI_LOG << "Set dropout proportions for " - << num_dropout_proportions_set << " components."; } else { KALDI_ERR << "Directive '" << directive << "' is not currently " "supported (reading edit-config)."; diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 0b5ab3c1fd4..55fcddd7f58 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -97,7 +97,7 @@ bool IsSimpleNnet(const Nnet &nnet); void ZeroComponentStats(Nnet *nnet); -/// ComputeNnetContext computes the left-context and right-context of a nnet. +/// ComputeSimpleNnetContext computes the left-context and right-context of a nnet. /// The nnet must satisfy IsSimpleNnet(nnet). /// /// It does this by constructing a ComputationRequest with a certain number of inputs @@ -158,7 +158,7 @@ void ConvertRepeatedToBlockAffine(Nnet *nnet); /// Info() function (we need this in the CTC code). std::string NnetInfo(const Nnet &nnet); -/// This function sets the dropout proportion in all dropout component to +/// This function sets the dropout proportion in all dropout component to /// dropout_proportion value. void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); From b08940a6ea5fcc83a7da0450a3d36462e064785f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 7 Nov 2016 00:51:34 -0500 Subject: [PATCH 013/213] Going some way towards optimization for online decoding (identified pieces to splice). --- src/nnet3/nnet-optimize-utils.cc | 371 +++++++++++++++++++++++++++++-- src/nnet3/nnet-optimize.cc | 55 +++-- src/nnet3/nnet-optimize.h | 80 ++----- 3 files changed, 408 insertions(+), 98 deletions(-) diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 3b7dda18e96..d2d6daf2a47 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -1670,8 +1670,13 @@ class ComputationOnlineOptimizer { bool Optimize(); private: + + // Figures out the time shift between the successive computation requests. 
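The learning-rate utilities added above (ScaleLearningRate, SetLearningRates, GetLearningRates, ScaleNnetComponents) all iterate over the updatable components in the same order; a hypothetical caller combining the Get/Set pair might do:

    Vector<BaseFloat> learning_rates;
    GetLearningRates(nnet, &learning_rates);   // one entry per updatable component
    learning_rates.Scale(0.5);                 // e.g. halve every per-component rate
    SetLearningRates(learning_rates, &nnet);   // write them back in the same order

For a uniform factor like this, ScaleLearningRate() above is of course sufficient; the Get/Set pair exists for per-component schedules.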
+ static int32 FindTimeShift(const NnetComputation &computation, + const std::vector &segment_ends); + // This function creates a mapping from a matrix-index > 0, - // to a pair (time_offset, unique_id) that represents the debug-info + // to a pair (unique_id, time_offset) that represents the debug-info // for that matrix-id in computation.debug_info. // The output vector is indexed by the matrix-index in the computation (the // zeroth member is not valid). It requires that the @@ -1682,10 +1687,83 @@ class ComputationOnlineOptimizer { // value of the DebugInfo. That is, if two 'cindexes' vectors differ only // by a time offset, and the 'is_deriv' values are the same they will map to the same // unique_id. + // The output 'matrix_to_pair' is indexed by matrix index (the zeroth element is + // not set). static void CreateMatrixPairs(const NnetComputation &computation, std::vector > *matrix_to_pair); + // This very simple helper function reverses the map 'matrix_to_pair' so we can + // do the reverse lookup. It outputs a map from pair to matrix index m, where + // 1 <= m < matrix_to_pair.size(). + static void GetPairToMatrixMap( + std::vector > &matrix_to_pair, + unordered_map, int32, PairHasher > *pair_to_matrix); + + + // Given a vector of lists, one list for each segment, of the active matrices + // at the end of that segment, this function converts those lists into a + // different representation where each matrix is reprented as a pair instead + // of as a single int32. 'active_pairs' will have the same dimensions as + // 'active_matrices'. + static void ConvertListsToPairLists( + const std::vector > &active_matrices, + const std::vector > &matrix_to_pair, + std::vector > > *active_pairs); + + // This function modifies the lists of active matrices per segment + // (represented as pairs) in 'active_pairs' by sorting them and + // then subtracting the time-offset of the first pair in each + // list ((*active_pair)[seg][0].second), from all elements in that list. + // It puts the subtracted offset in (*time_offsets)[seg]. This change + // of representation makes it easy to tell whether the sets of active + // matrices for different segments are identical up to a time-offset. + static void NormalizePairLists( + std::vector > > *active_pairs, + std::vector *time_offsets); + + // This function looks in the matrix 'active_pairs' for the first pair of + // identical values, i.e. it is looking for i < j for which + // normalized_active_pairs[i] == normalized_active_pairs[j]. If there + // is such a pair it outputs them to *seg1 and *seg2, and returns true; + // otherwise it returns false. + // + // Update to the above: It turns out that under some circumstances, the + // original function found repeats that were not "really" repeats (the + // matrices were not time shifted) The situation was a bit obscure (it was a + // non-recurrent setup with a lot of extra-right-context, where some inputs + // were never used), but to prevent it happening again we are now checking + // in addition to the above, that the time-shift between the segments + // (i.e. time_offsets[j] - time_offsets[i]), has the "expected value" + // based on the assumption that each segment should be shifted relative + // to the previous segment, by 'time_shift_per_segment'. + static bool FindFirstRepeat( + const std::vector > > &normalized_active_pairs, + const std::vector &time_offsets, + int32 time_shift_per_segment, + int32 *seg1, int32 *seg2); + + // Converts a list of pairs (e.g. 
one of the elements of the output of + // 'ConvertListsToPairLists)', back into a list of matrix indexes, using the + // map 'pair_to_matrix'. + static void PairListToMatrixList( + const std::vector > &pair_list, + const unordered_map, int32, PairHasher > &pair_to_matrix, + std::vector *matrix_list); + + + // This function just does some checking (via asserts), that + // the lists of matrices 'list1' and 'list2' are of the same length, + // that time_difference > 0, that each matrix with index m = list2[i] is of the + // same dimension as the list1[i], with Cindexes that are the same except for + // the time index being greater by 'time_difference' + static void CheckIdentifiedMatrices( + const NnetComputation &computation, + const std::vector &list1, + const std::vector &list2, + int32 time_difference); + + /// Given a list of command indexes ('segment_end_commands') which are /// expected to be command indexes of the kNoOperationMarker at segment /// boundaries, this function outputs for each of these command indexes a list @@ -1697,8 +1775,8 @@ class ComputationOnlineOptimizer { /// same index as 'segment_end_commands', and is then a list of active /// matrices, in numerical order of matrix index. static void FindActiveMatrices(const NnetComputation &computation, - const std::vector &segment_end_commands, const Analyzer &analyzer, + const std::vector &segment_end_commands, std::vector > *active_matrices); @@ -1713,6 +1791,56 @@ class ComputationOnlineOptimizer { }; +// static +int32 ComputationOnlineOptimizer::FindTimeShift( + const NnetComputation &computation, + const std::vector &segment_ends) { + KALDI_ASSERT(segment_ends.size() >= 3); + // Ignore the first segment as it tends to be a special case + // (it has more left context), + int32 second_segment_begin = segment_ends[0], + third_segment_begin = segment_ends[1], + fourth_segment_begin = segment_ends[2]; + int32 first_output_command_seg2 = -1, + first_output_command_seg3 = -1; + for (int32 c = second_segment_begin; c < third_segment_begin; c++) + if (computation.commands[c].command_type == kProvideOutput && + first_output_command_seg2 < 0) + first_output_command_seg2 = c; + for (int32 c = third_segment_begin; c < fourth_segment_begin; c++) + if (computation.commands[c].command_type == kProvideOutput && + first_output_command_seg3 < 0) + first_output_command_seg3 = c; + if (first_output_command_seg2 < 0 || + first_output_command_seg3 < 0) + KALDI_ERR << "Could not locate output commands for segments 2 and 3."; + const NnetComputation::Command + &command2 = computation.commands[first_output_command_seg2], + &command3 = computation.commands[first_output_command_seg3]; + int32 seg2_node = command2.arg2, seg3_node = command3.arg2; + KALDI_ASSERT(seg2_node == seg3_node); + int32 seg2_submatrix = command2.arg1, + seg3_submatrix = command3.arg1; + KALDI_ASSERT(computation.IsWholeMatrix(seg2_submatrix) && + computation.IsWholeMatrix(seg3_submatrix)); + int32 seg2_matrix = computation.submatrices[seg2_submatrix].matrix_index, + seg3_matrix = computation.submatrices[seg3_submatrix].matrix_index; + KALDI_ASSERT(computation.matrices[seg2_matrix].num_rows == + computation.matrices[seg3_matrix].num_rows); + KALDI_ASSERT(!computation.matrix_debug_info.empty()); + const NnetComputation::MatrixDebugInfo + &debug_info2 = computation.matrix_debug_info[seg2_matrix], + &debug_info3 = computation.matrix_debug_info[seg3_matrix]; + int32 t_offset = debug_info3.cindexes[0].second.t - + debug_info2.cindexes[0].second.t; + int32 num_rows = 
debug_info2.cindexes.size(); + for (int32 r = 0; r < num_rows; r++) { + KALDI_ASSERT(debug_info3.cindexes[r].second.t == + debug_info2.cindexes[r].second.t + t_offset); + } + return t_offset; +} + // static void ComputationOnlineOptimizer::CreateMatrixPairs( const NnetComputation &computation, @@ -1744,27 +1872,135 @@ void ComputationOnlineOptimizer::CreateMatrixPairs( } bool is_deriv = computation.matrix_debug_info[m].is_deriv; int32 unique_id = 2 * vector_id + (is_deriv ? 1 : 0); - (*matrix_to_pair)[m].first = t_offset; - (*matrix_to_pair)[m].second = unique_id; + (*matrix_to_pair)[m].first = unique_id; + (*matrix_to_pair)[m].second = t_offset; + } +} + +// static +void ComputationOnlineOptimizer::GetPairToMatrixMap( + std::vector > &matrix_to_pair, + unordered_map, int32, PairHasher > *pair_to_matrix) { + int32 num_matrices = matrix_to_pair.size(); + // actually there are one fewer matrices than num_matrices. + pair_to_matrix->clear(); + for (int32 m = 1; m < num_matrices; m++) + (*pair_to_matrix)[matrix_to_pair[m]] = m; +} + + +// static +void ComputationOnlineOptimizer::ConvertListsToPairLists( + const std::vector > &active_matrices, + const std::vector > &matrix_to_pair, + std::vector > > *active_pairs) { + active_pairs->clear(); + active_pairs->resize(active_matrices.size()); + int32 num_matrices = matrix_to_pair.size(); + for (size_t seg = 0; seg < active_matrices.size(); seg++) { + const std::vector &this_active_matrix_list = active_matrices[seg]; + std::vector > &this_active_pair_list = + (*active_pairs)[seg]; + this_active_pair_list.resize(this_active_matrix_list.size()); + std::vector::const_iterator iter = this_active_matrix_list.begin(), + end = this_active_matrix_list.end(); + std::vector >::iterator + out_iter = this_active_pair_list.begin(); + for (; iter != end; ++iter, ++out_iter) { + KALDI_ASSERT(*iter > 0 && *iter < num_matrices); + *out_iter = matrix_to_pair[*iter]; + } + } +} + +// static +void ComputationOnlineOptimizer::NormalizePairLists( + std::vector > > *active_pairs, + std::vector *time_offsets) { + int32 num_segments = active_pairs->size(); + time_offsets->resize(num_segments); + for (int32 seg = 0; seg < num_segments; seg++) { + std::vector > &this_pairs = (*active_pairs)[seg]; + std::sort(this_pairs.begin(), this_pairs.end()); + int32 this_offset; + if (!this_pairs.empty()) { + this_offset = this_pairs[0].second; + } else { + // if this_pairs is empty, produce arbitrary offsets that are increasing + // (this will keep some self-testing code happy). + if (seg == 0) { this_offset = 0; } + else { this_offset = (*time_offsets)[seg - 1] + 1; } + } + (*time_offsets)[seg] = this_offset; + std::vector >::iterator + iter = this_pairs.begin(), end = this_pairs.end(); + for (; iter != end; ++iter) + iter->second -= this_offset; + } +} + + +// static +bool ComputationOnlineOptimizer::FindFirstRepeat( + const std::vector > > &normalized_active_pairs, + const std::vector &time_offsets, + int32 time_shift_per_segment, + int32 *seg1, int32 *seg2) { + int32 num_segments = normalized_active_pairs.size(); + // This algorithm may seem like it would be very slow, but the number of + // segments will normally be quite small (e.g. 10), and the comparison of + // elements of 'normalized_active_pairs' should be fast in cases where they + // differ. 
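+  // (With, say, 10 segments that is at most 45 pairwise comparisons, and each
+  // vector comparison stops as soon as a mismatching element is found.)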
+ for (int32 s = 0; s < num_segments; s++) { + for (int32 t = s + 1; t < num_segments; t++) { + if (time_offsets[t] - time_offsets[s] == (t - s) * time_shift_per_segment + && normalized_active_pairs[s] == normalized_active_pairs[t]) { + *seg1 = s; + *seg2 = t; + return true; + } + } } + return false; } +// static +void ComputationOnlineOptimizer::PairListToMatrixList( + const std::vector > &pair_list, + const unordered_map, int32, PairHasher > &pair_to_matrix, + std::vector *matrix_list) { + matrix_list->resize(pair_list.size()); + std::vector >::const_iterator + iter = pair_list.begin(), end = pair_list.end(); + std::vector::iterator out_iter = matrix_list->begin(); + for (; iter != end; ++iter, ++out_iter) { + unordered_map, int32, + PairHasher >::const_iterator + map_iter = pair_to_matrix.find(*iter); + if (map_iter == pair_to_matrix.end()) { + KALDI_ERR << "Could not find pair in map (code error)"; + } + *out_iter = map_iter->second; + } +} + + // static void ComputationOnlineOptimizer::FindActiveMatrices( const NnetComputation &computation, - const std::vector &segment_end_commands, const Analyzer &analyzer, + const std::vector &segment_end_commands, std::vector > *active_matrices) { int32 num_matrices = computation.matrices.size(); int32 num_segments = segment_end_commands.size(); active_matrices->clear(); active_matrices->resize(num_segments); - // this object just makes available some extra functions. + // this object just makes available some extra functions, vs. the Analyzer + // object. ComputationAnalysis analysis(computation, analyzer); - for (int32 s = 0; s + 1 < num_segments; s++) { - KALDI_ASSERT(segment_end_commands[s] < segment_end_commands[s+1]); - } + KALDI_ASSERT(IsSortedAndUniq(segment_end_commands)); + // the following vector gives us, for each matrix index, a submatrix index // that covers the whole of that matrix (needed by interface of 'analysis' object). std::vector whole_submatrices; @@ -1772,31 +2008,130 @@ void ComputationOnlineOptimizer::FindActiveMatrices( for (int32 m = 1; m < num_matrices; m++) { // the following are command indexes, comparable with the indexes // in 'segment_end_commands'. - int32 s = whole_submatrices[m]; // submatrix consisting of the whole of + int32 s = whole_submatrices[m], // submatrix consisting of the whole of // 'm'. - int32 first_access = analysis.FirstAccess(s), + first_access = analysis.FirstAccess(s), last_access = analysis.LastAccess(s); - std::vector::const_iterator iter = segment_end_commands.begin(), - end = segment_end_commands.end(); - for (; iter != end; ++iter) { - int32 segment_end = *iter; + for (int32 seg = 0; seg < num_segments; seg++) { + int32 segment_end = segment_end_commands[seg]; if (first_access < segment_end && last_access > segment_end) { - // TODO. + // If the block of time during which the matrix is accessed, includes + // this segment end-point, then the matrix is considered 'active' at + // that time. 
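+        // (A matrix that is both created and destroyed strictly inside a
+        // single segment never satisfies this condition, so purely
+        // chunk-internal temporaries are not listed.)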
+ (*active_matrices)[seg].push_back(m); } } } +} +// static +void ComputationOnlineOptimizer::CheckIdentifiedMatrices( + const NnetComputation &computation, + const std::vector &list1, + const std::vector &list2, + int32 time_difference) { + KALDI_ASSERT(time_difference > 0); + KALDI_ASSERT(list1.size() == list2.size()); + KALDI_ASSERT(!computation.matrix_debug_info.empty()); + for (size_t i = 0; i < list1.size(); i++) { + int32 m1 = list1[i], m2 = list2[i]; + const NnetComputation::MatrixInfo + &matrix_info1 = computation.matrices[m1], + &matrix_info2 = computation.matrices[m2]; + KALDI_ASSERT(matrix_info1.num_rows == matrix_info2.num_rows && + matrix_info1.num_cols == matrix_info2.num_cols && + matrix_info1.stride_type == matrix_info2.stride_type); + const NnetComputation::MatrixDebugInfo + &debug_info1 = computation.matrix_debug_info[m1], + &debug_info2 = computation.matrix_debug_info[m2]; + KALDI_ASSERT(debug_info1.is_deriv == debug_info2.is_deriv); + KALDI_ASSERT(debug_info1.cindexes.size() == debug_info2.cindexes.size()); + std::vector::const_iterator iter1 = debug_info1.cindexes.begin(), + end1 = debug_info1.cindexes.end(), + iter2 = debug_info2.cindexes.begin(); + for (; iter1 != end1; iter1++,iter2++) { + KALDI_ASSERT(iter2->first == iter1->first && + iter2->second.n == iter1->second.n && + iter2->second.t == iter1->second.t + time_difference && + iter2->second.x == iter1->second.x); + } + } } + bool ComputationOnlineOptimizer::Optimize() { analyzer_.Init(nnet_, *computation_); KALDI_ASSERT(!computation_->matrix_debug_info.empty() && "You must request matrix debug info when compiling " "online computations."); - // TODO. + // get the indexes of the separator commands at the ends of segments. + std::vector segment_ends; + GetSegmentEnds(*computation_, &segment_ends); + int32 time_shift_per_segment = FindTimeShift(*computation_, + segment_ends); + + // Ignore the end of the very last segment- it is not a candidate for a + // 'splice point'. What we're doing here is like creating a tape loop; we + // have to find a place where the list of variables is the same except for a + // time offset. + // [note: it's not exactly like a tape loop because the prologue can + // vary... the sequence is of the form like a b b b b b .. ] + segment_ends.pop_back(); + + + std::vector > active_matrices; + // Find the list of matrices active at each of those segment-end-command + // times. + FindActiveMatrices(*computation_, analyzer_, segment_ends, + &active_matrices); + + // Find a representation of the matrices of the computation as pairs + // (unique_id, time_offset) that are more amenable to finding + // matrices that represet lists of Cindexes that differ only by + // a time offset. + std::vector > matrix_to_pair; + CreateMatrixPairs(*computation_, &matrix_to_pair); + + // Create the reverse map from pair to matrix index; we'll need it. + unordered_map, int32, PairHasher > pair_to_matrix; + GetPairToMatrixMap(matrix_to_pair, &pair_to_matrix); + + // get lists of matrix per segment in the pair representation. + std::vector > > pair_lists; + ConvertListsToPairLists(active_matrices, matrix_to_pair, + &pair_lists); + + std::vector time_offsets; + NormalizePairLists(&pair_lists, &time_offsets); + + int32 seg1, seg2; + + if (!FindFirstRepeat(pair_lists, + time_offsets, + time_shift_per_segment, + &seg1, &seg2)) { + KALDI_VLOG(2) << "Could not find repeats of variables."; + return false; + } - return false; + // reverse the normalization for segments seg1 and seg2. 
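+  // (This restores the absolute time-offsets inside the pairs; we need those
+  // in order to look the pairs up in 'pair_to_matrix' below, since that map
+  // was built from the un-normalized pairs.)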
+ for (size_t i = 0; i < pair_lists[seg1].size(); i++) + pair_lists[seg1][i].second += time_offsets[seg1]; + for (size_t i = 0; i < pair_lists[seg2].size(); i++) + pair_lists[seg2][i].second += time_offsets[seg2]; + std::vector seg1_matrices, seg2_matrices; + PairListToMatrixList(pair_lists[seg1], pair_to_matrix, &seg1_matrices); + PairListToMatrixList(pair_lists[seg2], pair_to_matrix, &seg2_matrices); + + int32 time_difference = time_offsets[seg2] - time_offsets[seg1]; + CheckIdentifiedMatrices(*computation_, seg1_matrices, seg2_matrices, + time_difference); + + // HERE, do whatever kind of identification we have to do between the two + // lists of matrices. + + return true; } diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 4cb18d26ea4..602aa1b2f86 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -418,9 +418,6 @@ void ConvertAdditionToAssignment(const Nnet &nnet, void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, NnetComputation *computation) { - if (!config.optimize) - return; - if (GetVerboseLevel() >= 4) CheckComputation(nnet, *computation, true); @@ -438,42 +435,52 @@ void Optimize(const NnetOptimizeOptions &config, if (GetVerboseLevel() >= 4) CheckComputation(nnet, *computation, true); - if (config.consolidate_model_update) + if (config.optimize && config.consolidate_model_update) ConsolidateModelUpdate(nnet, computation); if (GetVerboseLevel() >= 4) CheckComputation(nnet, *computation, true); - if (config.convert_addition) + if (config.optimize && config.convert_addition) { ConvertAdditionToAssignment(nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, true); + } - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, true); - - if (config.remove_assignments || config.backprop_in_place || - config.propagate_in_place) + if (config.optimize && + (config.remove_assignments || config.backprop_in_place || + config.propagate_in_place)) { VariableMergingOptimization(config, nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); - - if (config.initialize_undefined) + if (config.optimize && config.initialize_undefined) { RemoveUnnecessaryZeroing(nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); - - if (config.move_sizing_commands) + if (config.optimize && config.move_sizing_commands) { MoveSizingCommands(nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); + // the online computation optimization has to go before + // 'RemoveUnnecessaryAllocation()'. We don't gate this by 'config.optimize' + // because it's necessary for online computation to run. 
+ if (config.optimize_online_computation){ + OptimizeOnlineComputation(nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } - if (config.allocate_from_other) + if (config.optimize && config.allocate_from_other) { RemoveUnnecessaryAllocation(nnet, computation); - - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } // The following is not configurable because it is necessary for // the computation to run correctly (we do it after compilation too, diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index ce9e4de240a..303b08a4150 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -29,7 +29,7 @@ namespace kaldi { namespace nnet3 { -// Options class for optimizing a NnetComputation. The main projected use for +// Options class for optimizing a NnetComputation The main projected use for // this is in debugging the optimization code itself, so that if an error is // detected, we can work out which optimization was responsible for the error. struct NnetOptimizeOptions { @@ -46,23 +46,25 @@ struct NnetOptimizeOptions { bool allocate_from_other; int32 min_deriv_time; int32 max_deriv_time; - int32 max_deriv_time_relative; - - NnetOptimizeOptions(): - optimize(true), - consolidate_model_update(true), - propagate_in_place(true), - backprop_in_place(true), - convert_addition(true), - remove_assignments(true), - allow_left_merge(true), - allow_right_merge(true), - initialize_undefined(true), - move_sizing_commands(true), - allocate_from_other(true), - min_deriv_time(std::numeric_limits::min()), - max_deriv_time(std::numeric_limits::max()), - max_deriv_time_relative(std::numeric_limits::max()) {} + // optimize_online_computation is a 'hidden config' not available from + // the command line; it's set to true to enable the optimization for + // online computation that turns a linear computation into a loop. + bool optimize_online_computation; + + NnetOptimizeOptions(): optimize(true), + consolidate_model_update(true), + propagate_in_place(true), + backprop_in_place(true), + convert_addition(true), + remove_assignments(true), + allow_left_merge(true), + allow_right_merge(true), + initialize_undefined(true), + move_sizing_commands(true), + allocate_from_other(true), + min_deriv_time(std::numeric_limits::min()), + max_deriv_time(std::numeric_limits::max()), + optimize_online_computation(false) { } void Register(OptionsItf *opts) { opts->Register("optimize", &optimize, "Set this to false to turn off all " @@ -102,12 +104,6 @@ struct NnetOptimizeOptions { "the maximum t value that you want derivatives to be computed " "at when updating the model. This is an optimization that " "saves time in the backprop phase for recurrent frameworks"); - opts->Register("max-deriv-time-relative", &max_deriv_time_relative, - "An alternative mechanism for setting the --max-deriv-time, " - "suitable for situations where the length of the egs is " - "variable. 
If set, it is equivalent to setting the " - "--max-deriv-time to this value plus the largest 't' value " - "in any 'output' node of the computation request."); } void Read(std::istream &is, bool binary); void Write(std::ostream &os, bool binary) const; @@ -138,47 +134,20 @@ struct ComputationRequestPtrEqual { } }; - - -struct CachingOptimizingCompilerOptions { - bool use_shortcut; - int32 write_cache; - int32 cache_capacity; - - - - CachingOptimizingCompilerOptions(): - use_shortcut(true), - cache_capacity(64) { } - - void Register(OptionsItf *opts) { - opts->Register("use-shortcut", &use_shortcut, - "If true, use the 'shortcut' in compilation whereby " - "computation requests with regular structure are identified " - "as such, a computation with a smaller number of distinct " - "values of 'n' is compiled (e.g. 2), and the compiled " - "computation is expanded to match the size of the real " - "computation request."); - opts->Register("cache-capacity", &cache_capacity, - "Determines how many computations the computation-cache will " - "store (most-recently-used)."); - } -}; - /// This class enables you to do the compilation and optimization in one call, /// and also ensures that if the ComputationRequest is identical to the previous /// one, the compilation process is not repeated. class CachingOptimizingCompiler { public: CachingOptimizingCompiler(const Nnet &nnet, - const CachingOptimizingCompilerOptions &config): - nnet_(nnet), config_(config), cache_capacity_(capacity) { } + const int32 capacity = 20): + nnet_(nnet), cache_capacity_(capacity) { } /// Note: nnet is retained as a const reference but opt_config is copied. CachingOptimizingCompiler(const Nnet &nnet, const NnetOptimizeOptions &opt_config, - const CachingOptimizingCompilerOptions &config): - nnet_(nnet), config_(config), opt_config_(opt_config) { } + const int32 capacity = 20): + nnet_(nnet), opt_config_(opt_config), cache_capacity_(capacity) { } ~CachingOptimizingCompiler(); /// Does the compilation and returns a const pointer to @@ -190,7 +159,6 @@ class CachingOptimizingCompiler { void WriteCache(std::ostream &os, bool binary) const; private: const Nnet &nnet_; - CachingOptimizingCompilerOptions config_; NnetOptimizeOptions opt_config_; // The access queue for keeping track of the freshness of computation. From 57a4af9c04432d3c4f4bc7d7d55abf673c4df9d4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 10 Nov 2016 21:19:46 -0500 Subject: [PATCH 014/213] Get the online optimization code working to the point where the tests run. 
--- src/nnet3/nnet-analyze.cc | 132 ++++++++++-------- src/nnet3/nnet-analyze.h | 21 ++- src/nnet3/nnet-compile-test.cc | 2 +- src/nnet3/nnet-computation.cc | 16 +++ src/nnet3/nnet-computation.h | 8 +- src/nnet3/nnet-optimize-utils.cc | 226 ++++++++++++++++++++++++++++--- src/nnet3/nnet-optimize-utils.h | 14 +- src/nnet3/nnet-optimize.cc | 40 +++++- 8 files changed, 371 insertions(+), 88 deletions(-) diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 5513ce24a92..956c933d417 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -212,7 +212,7 @@ std::string ComputationVariables::DescribeVariable(int32 variable) const { num_column_variables = column_split_points_[matrix_index].size() - 1, num_row_variables = row_split_points_[matrix_index].size() - 1, column_variable = offset % num_column_variables, - row_variable = offset / num_row_variables; + row_variable = offset / num_column_variables; KALDI_ASSERT(column_variable >= 0 && row_variable >= 0 && row_variable < num_row_variables && column_variable < num_column_variables); @@ -381,6 +381,8 @@ void ComputeCommandAttributes( } case kNoOperation: case kNoOperationMarker: + case kNoOperationLabel: + case kGotoLabel: break; default: KALDI_ERR << "Unknown command type."; @@ -558,7 +560,6 @@ ComputationChecker::ComputationChecker( void ComputationChecker::Check() { CheckComputationIndexes(); a_.Init(nnet_, computation_); - CheckComputationOrder(); CheckComputationMatrixAccesses(); CheckComputationUndefined(); CheckComputationDebugInfo(); @@ -580,8 +581,12 @@ void ComputationChecker::CheckComputationRewrite() const { for (int32 v = 0; v < num_variables; v++) { const std::vector &accesses = a_.variable_accesses[v]; if (accesses.empty()) { - KALDI_ERR << "Variable " << v << " = " << a_.variables.DescribeVariable(v) - << "is never used."; + if (config_.check_unused_variables) { + KALDI_ERR << "Variable " << v << " = " << a_.variables.DescribeVariable(v) + << " is never used."; + } else { + continue; + } } int32 num_accesses = accesses.size(); int32 first_pure_read = -1; @@ -597,8 +602,8 @@ void ComputationChecker::CheckComputationRewrite() const { if (accesses[access].access_type != kReadAccess) { KALDI_ERR << "Variable " << v << " = " << a_.variables.DescribeVariable(v) - << "is modified after being read " - << "(this is not expected before optimization)"; + << " is modified after being read" + << " (this is not expected before optimization)"; } } } @@ -613,13 +618,17 @@ void ComputationChecker::CheckComputationUndefined() const { int32 num_variables = a_.variable_accesses.size(); for (int32 v = 0; v < num_variables; v++) { const std::vector &accesses = a_.variable_accesses[v]; - if (accesses.empty()) - KALDI_ERR << "Variable " << v << " == " - << a_.variables.DescribeVariable(v) << "is never used."; - if (accesses[0].access_type != kWriteAccess) - KALDI_ERR << "Variable " << v << " == " - << a_.variables.DescribeVariable(v) - << " is read before it is written to"; + if (accesses.empty()) { + if (config_.check_unused_variables) { + KALDI_ERR << "Variable " << v << " == " + << a_.variables.DescribeVariable(v) << "is never used."; + } + } else { + if (accesses[0].access_type != kWriteAccess) + KALDI_ERR << "Variable " << v << " == " + << a_.variables.DescribeVariable(v) + << " is read before it is written to"; + } } } @@ -637,7 +646,7 @@ void ComputationChecker::CheckComputationMatrixAccesses() const { for (int32 matrix_index = 1; matrix_index < num_matrices; matrix_index++) { const MatrixAccesses &accesses = 
a_.matrix_accesses[matrix_index]; if (accesses.allocate_command == -1) - KALDI_ERR << "Matrix m" << matrix_index << "is not initialized."; + KALDI_ERR << "Matrix m" << matrix_index << " is not initialized."; if (accesses.accesses.empty()) { KALDI_ERR << "Matrix m" << matrix_index << " is never accessed."; } else if (accesses.accesses.front().command_index < @@ -917,49 +926,24 @@ void ComputationChecker::CheckComputationIndexes() const { } case kNoOperation: case kNoOperationMarker: + case kNoOperationLabel: + break; + case kGotoLabel: { + int32 label_index = c.arg1; + if (label_index < 0 || label_index >= command_index || + computation_.commands[label_index].command_type != kNoOperationLabel) + KALDI_ERR << "kGotoLabel command has invalid destination index."; break; + if (command_index + 1 != num_commands) { + KALDI_ERR << "kGotoLabel is not the last command in the computation"; + } + } default: KALDI_ERR << "Unknown command type."; } } } - -// make sure Propagate comes before kNoOperationMarker and Backprop comes after -// it, and that the value of computation_computation_end matches the position of -// kNoOpMarker. -void ComputationChecker::CheckComputationOrder() const { - int32 num_commands = computation_.commands.size(); - int32 num_markers = 0, marker_location = 0; - for (int32 c = 0; c < num_commands; c++) { - if (computation_.commands[c].command_type == - kNoOperationMarker) { - marker_location = c; - num_markers++; - } - } - if (num_markers != 1) - KALDI_ERR << "Expected exactly one kNoOperationMarker marker."; - - for (int32 c = 0; c < num_commands; c++) { - CommandType command_type = - computation_.commands[c].command_type; - if (c != marker_location && - command_type == kNoOperationMarker) - KALDI_ERR << "Found kNoOpMarker in unexpected place"; - if (c < marker_location && - (command_type == kBackprop || - command_type == kBackpropNoModelUpdate)) - KALDI_ERR << "Backprop occurs before kNoOpMarker"; - if (c > marker_location && - command_type == kPropagate) - KALDI_ERR << "Propagate occurs after kNoOpMarker"; - if (c > marker_location && - command_type == kStoreStats) - KALDI_ERR << "StoreStats occurs after kNoOpMarker"; - } -} - void ComputationChecker::CheckComputationDebugInfo() const { if (computation_.matrix_debug_info.empty()) return; if (computation_.matrix_debug_info.size() != @@ -981,15 +965,57 @@ void ComputationChecker::CheckComputationDebugInfo() const { } } -void CheckComputation(const Nnet &nnet, - const NnetComputation &computation, - bool check_rewrite) { + +// note: 'computation' is not a reference, it's copied so that we +// can modify it internally. +static void CheckComputationOnline(const Nnet &nnet, + NnetComputation computation, + bool check_rewrite) { + int32 num_commands = computation.commands.size(); + KALDI_ASSERT(computation.commands[num_commands-1].command_type == kGotoLabel); + for (int32 c = num_commands - 2; + c >= 0 && computation.commands[c].command_type == kAllocMatrixFromOther; + c--) { + // this command can be interpreted as "initialize matrix referred to by + // c.arg2 with the matrix referred to by c.arg2". + // Because this would be interpreted by the analysis code as initializing a + // matrix that has already been initialized, we turn this into a command + // that just deallocates the matrix in c.arg2. [note: all these indexes + // are actually submatrix indexes]. 
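+    // (That is, we keep only the "deallocate the source matrix" half of the
+    // swap, which is all the analysis code needs to see here.)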
+ computation.commands[c].command_type = kDeallocMatrix; + std::swap(computation.commands[c].arg1, computation.commands[c].arg2); + } + CheckComputationOptions opts; opts.check_rewrite = check_rewrite; + opts.check_unused_variables = false; + // We can always do this check with online computations, since they do not + // have the RemoveUnnecessaryAllocation() optimization applied. ComputationChecker checker(opts, nnet, computation); checker.Check(); } +void CheckComputation(const Nnet &nnet, + const NnetComputation &computation, + bool check_rewrite) { + try { + if (!computation.commands.empty() && + computation.commands.back().command_type == kGotoLabel) { + // Online computations need to be treated specially. + CheckComputationOnline(nnet, computation, check_rewrite); + } else { + CheckComputationOptions opts; + opts.check_rewrite = check_rewrite; + ComputationChecker checker(opts, nnet, computation); + checker.Check(); + } + } catch (...) { + computation.Print(std::cerr, nnet); + KALDI_ERR << "Computation check failed for computation printed above " + "(actual error message is above computation)"; + } +} + void ComputeMatrixToSubmatrix( const NnetComputation &computation, std::vector > *mat_to_submat) { diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 7109575e415..4a827c05eb0 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -381,11 +381,22 @@ struct CheckComputationOptions { // do the check_rewrite check only for a non-optimized computation, it may // legitimately fail after optimization. see code for details. bool check_rewrite; - - CheckComputationOptions(): check_rewrite(false) { } + // If 'check_unused_variables' is true, it checks for unused variables + // (e.g. unused partsof matrices). We only set it false for online + // computations, where there can be instances where a part of a matrix is + // apparently never accessed (until we consider that the matrix is swapped + // with another). + bool check_unused_variables; + + CheckComputationOptions(): + check_rewrite(false), check_unused_variables(true) { } }; +// Note: this checker class does not work for online computations (that have a +// kGoto statement), but the function CheckComputation() is able to detect such +// computations and modify them in such a way that they can be checked by this +// class (and then do extra checks). class ComputationChecker { public: ComputationChecker(const CheckComputationOptions &config, @@ -395,10 +406,6 @@ class ComputationChecker { private: // various dimension consistency checks and checks on properties. void CheckComputationIndexes() const; - // make sure Propagate comes before kNoOpMarker and Backprop comes after it, - // and that the value of forward_computation_end matches the position of - // kNoOpMarker. - void CheckComputationOrder() const; // checks for a situation where an undefined variable is read. void CheckComputationUndefined() const; // checks that all writes are done before reads. details with implementation. @@ -426,6 +433,8 @@ void GetSegmentEnds(const NnetComputation &computation, /// This is a convenience interface for class ComputationChecker. Call it with /// check_rewrite = true only if the computation is pre-optimization. +/// If the computation is an 'online' computation, this function treats +/// it specially. 
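+/// (Specifically, it rewrites the trailing matrix-swap commands into plain
+/// deallocations and disables the unused-variable check, so that the normal
+/// checking code can then be applied; see CheckComputationOnline in
+/// nnet-analyze.cc.)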
void CheckComputation(const Nnet &nnet, const NnetComputation &computation, bool check_rewrite = false); diff --git a/src/nnet3/nnet-compile-test.cc b/src/nnet3/nnet-compile-test.cc index da08253093a..eaff78ad4c6 100644 --- a/src/nnet3/nnet-compile-test.cc +++ b/src/nnet3/nnet-compile-test.cc @@ -177,7 +177,7 @@ void UnitTestNnetCompileOnline() { int main() { using namespace kaldi; using namespace kaldi::nnet3; - SetVerboseLevel(2); + SetVerboseLevel(4); UnitTestNnetCompileOnline(); UnitTestNnetCompile(); diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 21ab88f5f12..046d8c824e3 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -285,6 +285,10 @@ void NnetComputation::Command::Read(std::istream &is, bool binary) { command_type = kNoOperation; } else if (command_type_str == "kNoOperationMarker") { command_type = kNoOperationMarker; + } else if (command_type_str == "kNoOperationLabel") { + command_type = kNoOperationLabel; + } else if (command_type_str == "kGotoLabel") { + command_type = kGotoLabel; } else { KALDI_ERR << "Un-handled command type."; } @@ -378,6 +382,12 @@ void NnetComputation::Command::Write(std::ostream &os, bool binary) const { case kNoOperationMarker: os << "kNoOperationMarker\n"; break; + case kNoOperationLabel: + os << "kNoOperationLabel\n"; + break; + case kGotoLabel: + os << "kGotoLabel\n"; + break; default: KALDI_ERR << "Un-handled command type."; } @@ -608,6 +618,12 @@ static void PrintCommand(std::ostream &os, case kNoOperationMarker: os << "# computation segment separator [e.g., begin backward commands]\n"; break; + case kNoOperationLabel: + os << "[label for goto statement]\n"; + break; + case kGotoLabel: + os << "goto c" << c.arg1 << "\n"; + break; default: KALDI_ERR << "Un-handled command type."; } diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index ba0eaada1a0..857dde1547b 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -221,6 +221,12 @@ struct ComputationRequest { - kNoOperation: does nothing (sometimes useful during optimization) - kNoOperationMarker: does nothing, but used to mark end of a block of commands (like forward commands). + - kNoOperationLabel: does nothing, but is the destination for + the kGotoLabel command. + - kGotoLabel: jumps to the kNoOperationLabel command. arg1 must + be set to the location of that command. Since there are no + conditionals, this should be the last command, as remaining + commands will be unreachable. */ enum CommandType { @@ -230,7 +236,7 @@ enum CommandType { kMatrixCopy, kMatrixAdd, kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti, kAddRowsMulti, kAddToRowsMulti, kAddRowRanges, kAcceptInput, kProvideOutput, - kNoOperation, kNoOperationMarker }; + kNoOperation, kNoOperationMarker, kNoOperationLabel, kGotoLabel }; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index d2d6daf2a47..7a0fafb0b5e 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -73,6 +73,8 @@ void IdentifySubmatrixArgs(NnetComputation::Command *c, break; case kNoOperation: case kNoOperationMarker: + case kNoOperationLabel: + case kGotoLabel: break; default: KALDI_ERR << "Unknown command type."; @@ -1724,19 +1726,21 @@ class ComputationOnlineOptimizer { // This function looks in the matrix 'active_pairs' for the first pair of // identical values, i.e. it is looking for i < j for which - // normalized_active_pairs[i] == normalized_active_pairs[j]. 
If there - // is such a pair it outputs them to *seg1 and *seg2, and returns true; - // otherwise it returns false. + // normalized_active_pairs[i] == normalized_active_pairs[j]. (However, the + // pair i,j must satisfy an extra condition, see below). If a pair + // i,j exists satisfying these conditions, this function outputs them to *seg1 + // and *seg2, and returns true; otherwise it returns false. // - // Update to the above: It turns out that under some circumstances, the - // original function found repeats that were not "really" repeats (the - // matrices were not time shifted) The situation was a bit obscure (it was a - // non-recurrent setup with a lot of extra-right-context, where some inputs - // were never used), but to prevent it happening again we are now checking - // in addition to the above, that the time-shift between the segments - // (i.e. time_offsets[j] - time_offsets[i]), has the "expected value" - // based on the assumption that each segment should be shifted relative - // to the previous segment, by 'time_shift_per_segment'. + // Extra condition: + // It turns out that under some circumstances, we can + // fine repeats that were not "really" repeats (the matrices were not time + // shifted) The situation was a bit obscure (it was a non-recurrent setup with + // a lot of extra-right-context, where some inputs were never used), but to + // prevent it happening again we are now checking in addition to the above, + // that the time-shift between the segments (i.e. time_offsets[j] - + // time_offsets[i]), has the "expected value" based on the assumption that + // each segment should be shifted relative to the previous segment, by + // 'time_shift_per_segment'. static bool FindFirstRepeat( const std::vector > > &normalized_active_pairs, const std::vector &time_offsets, @@ -1764,6 +1768,43 @@ class ComputationOnlineOptimizer { int32 time_difference); + // Given two command indexes command1 < command2 pointing to commands of type + // kNoOperationMarker, this function modifies the computation by + // removing all commands after command2, replacing command2 with a kGotoLabel + // command pointing to command1 and then inserting just before command1 + // a marker of type kNoOperationLabel. + static void FormInfiniteLoop(int32 command1, int32 command2, + NnetComputation *computation); + + // This is to be called after FormInfiniteLoop. It inserts, just before + // the final kGotoLabel command, commands that initialize + // each of the matrices in list 'matrices1' from the corresponding + // matrix in 'matrices2', using the kAllocMatrixFromOther command. + // This effectively does, for example, matrices1[i] = matrices2[i], + // while initializing matrices1[i] and deallocating matrices2[i]; + // it's implemented as a shallow swap. + // It does this in such an order that even if the two lists are + // not disjoint, the right thing happens. + static void AddMatrixSwapCommands( + const std::vector &matrices1, + const std::vector &matrices2, + NnetComputation *computation); + + + // Called from AddMatrixSwapCommands, this function figures out for us + // an acceptable order in which to execute the kAllocMatrixFromOther + // commands. This is easy to do if matrices1 and matrices2 are disjoint + // sets, but has to be done more carefully if they overlap. + // The output is a list of pairs where each pair (a, b) comes from + // from matrices1 and matrices2 in the same position, i.e. + // a = matrices1[i] and b = matrices2[i]. 
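+  // For example (with purely illustrative indexes): if matrices1 = {5, 6} and
+  // matrices2 = {6, 7}, the pair (5, 6) must come before (6, 7), because
+  // matrix 6 may only be overwritten after its old contents have been handed
+  // over to matrix 5.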
+ static void GetMatrixSwapOrder( + const std::vector &matrices1, + const std::vector &matrices2, + std::vector > *swaps); + + + /// Given a list of command indexes ('segment_end_commands') which are /// expected to be command indexes of the kNoOperationMarker at segment /// boundaries, this function outputs for each of these command indexes a list @@ -1774,6 +1815,7 @@ class ComputationOnlineOptimizer { /// at those points in time. '*active_matrices' is indexed by the /// same index as 'segment_end_commands', and is then a list of active /// matrices, in numerical order of matrix index. + /// Note: for each i, (*active_matrices)[i] will be sorted and unique. static void FindActiveMatrices(const NnetComputation &computation, const Analyzer &analyzer, const std::vector &segment_end_commands, @@ -1951,10 +1993,22 @@ bool ComputationOnlineOptimizer::FindFirstRepeat( // segments will normally be quite small (e.g. 10), and the comparison of // elements of 'normalized_active_pairs' should be fast in cases where they // differ. + KALDI_ASSERT(num_segments >= 2); + + bool perform_time_offset_check = true; + if (normalized_active_pairs.back().empty()) { + // If there are no variables active after the end of the last-but-one segment + // (which is the last element in segment_ends, since we remove the end of the + // very last segment), then don't perform the check related to + // time-offsets, it's not relevant. [this would probably be a computation + // that doesn't require any context]. + perform_time_offset_check = false; + } for (int32 s = 0; s < num_segments; s++) { for (int32 t = s + 1; t < num_segments; t++) { - if (time_offsets[t] - time_offsets[s] == (t - s) * time_shift_per_segment - && normalized_active_pairs[s] == normalized_active_pairs[t]) { + if ((!perform_time_offset_check || + time_offsets[t]-time_offsets[s] == (t-s) * time_shift_per_segment) && + normalized_active_pairs[s] == normalized_active_pairs[t]) { *seg1 = s; *seg2 = t; return true; @@ -2059,6 +2113,114 @@ void ComputationOnlineOptimizer::CheckIdentifiedMatrices( } +// static +void ComputationOnlineOptimizer::GetMatrixSwapOrder( + const std::vector &matrices1, + const std::vector &matrices2, + std::vector > *swaps) { + KALDI_ASSERT(matrices1.size() == matrices2.size()); + swaps->clear(); + int32 num_matrices = matrices1.size(); + std::vector processed(num_matrices, false); + std::vector queue; + + // num_loops is just for infinite-loop detection. + int32 num_loops = 0; + for (; static_cast(swaps->size()) < num_matrices; num_loops++) { + for (int32 i = 0; i < num_matrices; i++) { + if (processed[i]) + continue; + int32 m1 = matrices1[i], m2 = matrices2[i]; + std::vector::const_iterator iter = + std::lower_bound(matrices2.begin(), matrices2.end(), m1); + if (iter == matrices2.end() || *iter != m1) { + // Matrix m1 does not appear in the list 'matrices2', so + // we are safe to process it at any time. + swaps->push_back(std::pair(m1, m2)); + processed[i] = true; + } else { + int32 m1_pos_in_matrices2 = iter - matrices2.begin(); + if (processed[m1_pos_in_matrices2]) { + // We're safe to do this swap now, because the matrix m1 has already + // appeared on the RHS of a swap, and by this point has been + // deallocated, in effect. + swaps->push_back(std::pair(m1, m2)); + processed[i] = true; + } + // else do nothing, we cannot process m1 yet because + // at this point in the computation it is still allocated. + } + } + // The following assert is to check that we don't loop infinitely. 
We can + // prove that infinite looping won't happen, after on proving that there can + // be no cycles like (m1, m2), (m2, m3), (m3, m1) (the length of 3 is chosen + // arbitrarily as an example). If such a cycle existed, we can reach a + // contradiction based on the time-index (t) of the first cindex in m1. + // Define t1 = that time index, t2 the same for m2, t3 the same for m3. The + // existence of the three pairs [as pairs like (matrices1[i], matrices2[i])] + // implies that t2 > t1, t3 > t2, and t1 > t3 respectively, but this is + // impossible. + // This shows that all chains of dependencies must terminate. + KALDI_ASSERT(num_loops <= num_matrices); + } +} + +// static +void ComputationOnlineOptimizer::AddMatrixSwapCommands( + const std::vector &matrices1, + const std::vector &matrices2, + NnetComputation *computation) { + std::vector > swaps; + // Note: in 'easy' cases where matrices1 and matrices2 are disjoint, + // 'swaps' will just be the vector { (matrices1[0],matrices2[0]), + // (matrices1[1],matrices2[1]), ... }, + // but in some cases these may need to get reordered. + GetMatrixSwapOrder(matrices1, matrices2, &swaps); + + NnetComputation::Command goto_label_command = computation->commands.back(); + KALDI_ASSERT(goto_label_command.command_type == kGotoLabel); + computation->commands.pop_back(); + + // the following vector gives us, for each matrix index, a submatrix index + // that covers the whole of that matrix (needed because the commands + // require submatrix indexes) + std::vector whole_submatrices; + computation->GetWholeSubmatrices(&whole_submatrices); + size_t num_matrices = whole_submatrices.size(); + + for (size_t i = 0; i < swaps.size(); i++) { + int32 m1 = swaps[i].first, m2 = swaps[i].second; + KALDI_ASSERT(static_cast(m1) < num_matrices && + static_cast(m2) < num_matrices); + int32 s1 = whole_submatrices[m1], s2 = whole_submatrices[m2]; + computation->commands.push_back( + NnetComputation::Command( + kAllocMatrixFromOther, s1, s2)); + } + computation->commands.push_back(goto_label_command); +} + +// static +void ComputationOnlineOptimizer::FormInfiniteLoop( + int32 command1, int32 command2, + NnetComputation *computation) { + KALDI_ASSERT(static_cast(computation->commands.size()) >= + command2 + 1 && command1 < command2); + KALDI_ASSERT( + computation->commands[command1].command_type == kNoOperationMarker && + computation->commands[command2].command_type == kNoOperationMarker); + // Remove any commands after 'command2'. + computation->commands.resize(command2 + 1); + computation->commands[command2].command_type = kGotoLabel; + computation->commands[command2].arg1 = command1; + NnetComputation::Command c(kNoOperationLabel); + computation->commands.insert(computation->commands.begin() + command1, + c); + // Now the kNoOperationLabel command is at position 'command1'. +} + + + bool ComputationOnlineOptimizer::Optimize() { analyzer_.Init(nnet_, *computation_); KALDI_ASSERT(!computation_->matrix_debug_info.empty() && @@ -2105,8 +2267,9 @@ bool ComputationOnlineOptimizer::Optimize() { std::vector time_offsets; NormalizePairLists(&pair_lists, &time_offsets); + // Note: seg1 and seg2 are indexes into 'segment_ends', representing + // points in time (that happen to be the ends of segments). 
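+  // If no such repeat is found below we give up, and the caller is expected
+  // to recompile with a larger number of request segments.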
int32 seg1, seg2; - if (!FindFirstRepeat(pair_lists, time_offsets, time_shift_per_segment, @@ -2128,20 +2291,45 @@ bool ComputationOnlineOptimizer::Optimize() { CheckIdentifiedMatrices(*computation_, seg1_matrices, seg2_matrices, time_difference); - // HERE, do whatever kind of identification we have to do between the two - // lists of matrices. + + FormInfiniteLoop(segment_ends[seg1], segment_ends[seg2], computation_); + + AddMatrixSwapCommands(seg1_matrices, seg2_matrices, computation_); + + RenumberComputation(computation_); + + FixGotoLabel(computation_); return true; } -bool OptimizeOnlineComputation(const Nnet &nnet, +void OptimizeOnlineComputation(const Nnet &nnet, NnetComputation *computation) { ComputationOnlineOptimizer optimizer(nnet, computation); - return optimizer.Optimize(); + optimizer.Optimize(); } +void FixGotoLabel(NnetComputation *computation) { + int32 num_commands = computation->commands.size(); + if (num_commands == 0) + return; + if (computation->commands[num_commands-1].command_type == kGotoLabel) { + int32 dest_command = computation->commands[num_commands-1].arg1; + if (static_cast(dest_command) < computation->commands.size() && + computation->commands[dest_command].command_type == kNoOperationLabel) + return; // nothing to fix. + for (int32 c = 0; c + 1 < num_commands; c++) { + if (computation->commands[c].command_type == kNoOperationLabel) { + computation->commands[num_commands-1].arg1 = c; + return; + } + } + KALDI_ERR << "Label not found."; + } +} + } // namespace nnet3 diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 11a04354016..f2448f46fe5 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -574,12 +574,20 @@ void IdentifyIndexesRangesArgs(std::vector *commands, /// matrices have the same debug-info other than a time offset and can be /// identified with each other, and the no-op command at c2 can be replaced with /// 'got c1', creating a computation that 'goes on forever'. -/// It returns true if it successfully did this. [If this happens, the -/// whole computation may have to be regenerated with more segments.] -bool OptimizeOnlineComputation(const Nnet &nnet, +/// If it can't do this, it does nothing. You can figure out that this is the +/// case by checking whether kGotoLabel is the last command in the computation. +/// [If this optimization fails, the whole computation may have to be +/// regenerated with more segments.] +void OptimizeOnlineComputation(const Nnet &nnet, NnetComputation *computation); +/// This function ensures that the arg1 of a final command of type kGotoLabel is +/// the same as the command with type kNoOperationLabel. This is necessary +/// if you do any other type of optimization after 'OptimizeOnlineComputation()'. 
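+/// (Later passes such as RemoveNoOps or ConsolidateIoOperations may remove or
+/// reorder commands, which would otherwise leave the kGotoLabel's arg1
+/// pointing at the wrong command index.)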
+void FixGotoLabel(NnetComputation *computation); + + /* Possible TODO: diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 602aa1b2f86..480ed5cd41f 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -336,6 +336,7 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, kAllocMatrixFromOtherZeroed; } RemoveNoOps(computation); + FixGotoLabel(computation); } @@ -476,7 +477,11 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, false); } - if (config.optimize && config.allocate_from_other) { + if (config.optimize && config.allocate_from_other && + !config.optimize_online_computation) { + // Don't do this if it's an online computation because we're not sure if it + // would be correct in that case, as written. In any case the performance + // benefit is tiny. RemoveUnnecessaryAllocation(nnet, computation); if (GetVerboseLevel() >= 4) CheckComputation(nnet, *computation, false); @@ -488,6 +493,9 @@ void Optimize(const NnetOptimizeOptions &config, // other optimizations.) ConsolidateIoOperations(nnet, computation); + if (config.optimize_online_computation) + FixGotoLabel(computation); + if (GetVerboseLevel() >= 4) CheckComputation(nnet, *computation, false); } @@ -661,10 +669,10 @@ const NnetComputation* CachingOptimizingCompiler::Compile( return computation; } -/// Split the computation up into segments bounded internally by kNoOperationMarker. -/// For each segment, a pair of command-indexes (start, end) is output to the vector -/// 'segments', so the commands in the segment (not including kNoOperationMarker) -/// are numbered from start ... end - 1. +/// Split the computation up into segments bounded by kNoOperationMarker. For +/// each segment, a pair of command-indexes (start, end) is output to the vector +/// 'segments', so the commands in the segment (not including +/// kNoOperationMarker) are numbered from start ... end - 1. static void SplitComputationIntoSegments( const NnetComputation &computation, std::vector > *segments) { @@ -684,6 +692,10 @@ static void SplitComputationIntoSegments( void ConsolidateIoOperations(const Nnet &nnet, NnetComputation *computation) { + bool ends_with_goto = + (!computation->commands.empty() && + computation->commands.back().command_type == kGotoLabel); + // These segments, represented as (start-index, end-index), // are segments of the computation separated by kNoOperationMarker. std::vector > segments; @@ -732,6 +744,24 @@ void ConsolidateIoOperations(const Nnet &nnet, KALDI_ASSERT(c == segment_end); } computation->commands.swap(reordered_commands); + + if (ends_with_goto) { + // If, before this operation, the last command was kGotoLael, remove all + // commands that have been reordered to go after the kGotoLabel command + // [they would be unreachable anyway.] This relates to online computations. + // It may seem wrong that we are just removing these + // kAcceptInput/kProvideOutput commands, but the reason it's OK + // (and preserves equivalence with the code prior to this function call), + // is that the corresponding commands have also been moved past the + // kNoOperationLabel command that the goto jumps to, so those commands + // will actually get run. + // We don't actually check this here (it would lead to a crash when + // the computation was executed, if something is wrong in this logic). 
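+    // After discarding those trailing commands we call FixGotoLabel below,
+    // because the reordering may have changed the position of the
+    // kNoOperationLabel command.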
+ while (!computation->commands.empty() && + computation->commands.back().command_type != kGotoLabel) + computation->commands.pop_back(); + FixGotoLabel(computation); + } } From abdd595abb319b5ad22e81f122cb20785bd731fc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 10 Nov 2016 22:37:03 -0500 Subject: [PATCH 015/213] Add a couple of previously omitted files --- src/nnet3/nnet-compile-online.cc | 336 +++++++++++++++++++++++++++++++ src/nnet3/nnet-compile-online.h | 181 +++++++++++++++++ 2 files changed, 517 insertions(+) create mode 100644 src/nnet3/nnet-compile-online.cc create mode 100644 src/nnet3/nnet-compile-online.h diff --git a/src/nnet3/nnet-compile-online.cc b/src/nnet3/nnet-compile-online.cc new file mode 100644 index 00000000000..21430d79bbc --- /dev/null +++ b/src/nnet3/nnet-compile-online.cc @@ -0,0 +1,336 @@ +// nnet3/nnet-compile-online.cc + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3/nnet-compile-online.h" +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + + +void ModifyNnetIvectorPeriod(int32 ivector_period, + Nnet *nnet) { + KALDI_ASSERT(ivector_period > 0); + std::vector config_lines; + nnet->GetConfigLines(false, &config_lines); + std::ostringstream config_to_read; + for (size_t i = 0; i < config_lines.size(); i++) { + std::string s = config_lines[i]; + ConfigLine config_line; + bool b = config_line.ParseLine(config_lines[i]); + KALDI_ASSERT(b && "Could not parse config line."); + if (config_line.FirstToken() == "component-node") { + std::string whole_line = config_lines[i]; + std::string to_search_for = "ReplaceIndex(ivector, t, 0)"; + std::string::size_type pos = whole_line.find(to_search_for); + if (pos != std::string::npos) { + std::ostringstream to_replace_with; + to_replace_with << "Round(ivector, " << ivector_period << ")"; + whole_line.replace(pos, to_search_for.size(), to_replace_with.str()); + config_to_read << whole_line << "\n"; + } + } + } + if (!config_to_read.str().empty()) { + std::istringstream is(config_to_read.str()); + nnet->ReadConfig(is); + } +} + + +int32 GetChunkSize(const Nnet &nnet, + int32 frame_subsampling_factor, + int32 advised_chunk_size) { + int32 modulus = nnet.Modulus(); + KALDI_ASSERT(modulus > 0 && frame_subsampling_factor > 0 && + advised_chunk_size > 0); + int32 chunk_size = advised_chunk_size; + while (1) { + if (chunk_size % modulus == 0 && + chunk_size % frame_subsampling_factor == 0) + return chunk_size; + chunk_size++; + } +} + + +/// Mod(m, n), defined for integers m and n where n > 0, returns +/// the modulus m % n, defined as the integer 0 <= i < n +/// such that i and m are congruent modulo n; for instance, +/// Mod(13, 10) = 3. 
+/// This is like the % operation in C/C++, except that it always returns a +/// positive value even for negative m; in 99% of cases where it makes a +/// difference, this is what you want. In the C/C++ standard, the sign of a % b +/// for negative a is not specified (except by relation with the division '/' +/// operator), but in practice it would be <= 0 for almost all implementations. +template I Mod(I m, I n) { + if (m >= 0) return m % n; + else return -((-m) % n); +} + + +static void CreateComputationRequestInternal( + int32 begin_input_t, int32 end_input_t, + int32 begin_output_t, int32 end_output_t, + int32 num_sequences, + int32 frame_subsampling_factor, + const std::set &ivector_times, + ComputationRequest *request) { + request->inputs.reserve(2); + request->inputs.clear(); + request->inputs.resize(1 + (ivector_times.empty() ? 0 : 1)); + request->inputs[0].name = "input"; + request->inputs[0].has_deriv = false; + request->outputs.clear(); + request->outputs.resize(1); + request->outputs[0].name = "output"; + request->outputs[0].has_deriv = false; + if (!ivector_times.empty()) { + request->inputs[1].name = "ivector"; + request->inputs[1].has_deriv = false; + } + + // in the computation request the 'n' indexes (the sequence/utterance indexes) + // have the larger stride than 't', although this is opposite to the way it's + // done inside the computation. This is for user convenience where it may be + // easier to deal with submatrixes per sequence. + for (int32 n = 0; n < num_sequences; n++) { + int32 x = 0; + for (int32 t = begin_input_t; t < end_input_t; t++) { + request->inputs[0].indexes.push_back(Index(n, t, x)); + } + for (int32 t = begin_output_t; + t < end_output_t; + t += frame_subsampling_factor) + request->outputs[0].indexes.push_back(Index(n, t, x)); + } + if (!ivector_times.empty()) { + request->inputs.resize(2); + request->inputs[1].name = "ivector"; + request->inputs[1].has_deriv = false; + for (int32 n = 0; n < num_sequences; n++) { + // note: std::sets store things in sorted order. + for (std::set::const_iterator iter = ivector_times.begin(); + iter != ivector_times.end(); ++iter) { + int32 t = *iter, x = 0; + request->inputs[1].indexes.push_back(Index(n, t, x)); + } + } + } +} + + +void CreateOnlineComputationRequestSimple(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 extra_left_context_begin, + int32 extra_right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3) { + bool has_ivector = (nnet.InputDim("ivector") > 0); + int32 left_context, right_context; + ComputeSimpleNnetContext(nnet, &left_context, &right_context); + KALDI_ASSERT(chunk_size % frame_subsampling_factor == 0 && + chunk_size % nnet.Modulus() == 0 && + chunk_size % ivector_period == 0); + KALDI_ASSERT(extra_left_context_begin >= 0 && extra_right_context >= 0); + // note, 'end' is one past the last one. + int32 chunk1_input_begin_t = - left_context - extra_left_context_begin, + chunk1_input_end_t = chunk_size + right_context + extra_right_context, + chunk2_input_begin_t = chunk1_input_end_t, + chunk2_input_end_t = chunk2_input_begin_t + chunk_size, + chunk3_input_begin_t = chunk2_input_end_t, + chunk3_input_end_t = chunk3_input_begin_t + chunk_size; + + + // work out the times at which i-vectors are required. 
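+  // Note: each later chunk requests only those i-vector frames that were not
+  // already requested for an earlier chunk (see the count() checks below).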
+ std::set ivector_times1, ivector_times2, ivector_times3; + if (has_ivector) { + for (int32 t = chunk1_input_begin_t; t < chunk1_input_end_t; t++) { + int32 ivector_t = t - Mod(t, ivector_period); + ivector_times1.insert(ivector_t); + } + for (int32 t = chunk2_input_begin_t; t < chunk2_input_end_t; t++) { + int32 ivector_t = t - Mod(t, ivector_period); + if (ivector_times1.count(ivector_t) == 0) + ivector_times2.insert(ivector_t); + } + for (int32 t = chunk3_input_begin_t; t < chunk3_input_end_t; t++) { + int32 ivector_t = t - Mod(t, ivector_period); + if (ivector_times1.count(ivector_t) == 0 && + ivector_times2.count(ivector_t) == 0) { + ivector_times3.insert(ivector_t); + } + } + } + + CreateComputationRequestInternal( + chunk1_input_begin_t, chunk1_input_end_t, + 0, chunk_size, + num_sequences, frame_subsampling_factor, + ivector_times1, + request1); + + CreateComputationRequestInternal( + chunk2_input_begin_t, chunk2_input_end_t, + chunk_size, chunk_size * 2, + num_sequences, frame_subsampling_factor, + ivector_times2, + request2); + + CreateComputationRequestInternal( + chunk3_input_begin_t, chunk3_input_end_t, + chunk_size * 2, chunk_size * 3, + num_sequences, frame_subsampling_factor, + ivector_times3, + request3); + +} + + + +void AddTimeOffsetToComputationRequest(int32 t_offset, + ComputationRequest *request) { + for (size_t i = 0; i < request->inputs.size(); i++) { + size_t size = request->inputs[i].indexes.size(); + for (size_t j = 0; j < size; j++) + request->inputs[i].indexes[j].t += t_offset; + } + for (size_t i = 0; i < request->outputs.size(); i++) { + size_t size = request->outputs[i].indexes.size(); + for (size_t j = 0; j < size; j++) + request->outputs[i].indexes[j].t += t_offset; + } +} + + + +static bool ExtrapolateComputationRequest( + const ComputationRequest &request1, + const ComputationRequest &request2, + ComputationRequest *request3) { + // accepts two computation requests 'request1' and 'request2' that + // must be identical except for a time offset, and creates 'request3' + // that is the extrapolation of the next term in sequence. + *request3 = request2; + KALDI_ASSERT(!request1.inputs.empty() && !request1.inputs[0].indexes.empty() && + !request2.inputs.empty() && !request2.inputs[0].indexes.empty()); + int32 t_offset = request2.inputs[0].indexes[0].t - + request1.inputs[0].indexes[0].t; + // the following is just to make sure that the inputs are structurally + // equivalent. + AddTimeOffsetToComputationRequest(-t_offset, request3); + if (!(*request3 == request1)) + return false; // there is somse structural difference, or + // the time offset is not consistent. + // the following reverses the last call to AddTimeOffsetToComputationRequest, + // then adds the offset we want. + AddTimeOffsetToComputationRequest(2 * t_offset, request3); + return true; +} + + +/* Internal version of CompileOnline where + you specify the the number of computation requests (must be >= 3). + Returns true on success. + It's possible for the optimization to fail if you give too small + a value of 'num_requests' (this depends on the network topology), + and in that case this function will return false and you should re-try + with a higher value of num_requests. 
+ */ +static bool CompileOnlineInternal( + const Nnet &nnet, + NnetOptimizeOptions optimize_opts, + const ComputationRequest &request1, + const ComputationRequest &request2, + const ComputationRequest &request3, + int32 num_requests, + NnetComputation *computation) { + KALDI_ASSERT(num_requests >= 3); + std::vector extra_requests(num_requests - 3); + const ComputationRequest *prev_request = &request2; + const ComputationRequest *cur_request = &request3; + for (int32 i = 0; i < num_requests - 3; i++) { + if (!ExtrapolateComputationRequest(*prev_request, *cur_request, + &(extra_requests[i]))) { + KALDI_LOG << "prev_request is:"; + prev_request->Print(std::cerr); + KALDI_LOG << "cur_request is:"; + cur_request->Print(std::cerr); + KALDI_ERR << "Computation requests do not have the right relationship"; + } + prev_request = cur_request; + cur_request = &(extra_requests[i]); + } + + std::vector requests; + requests.push_back(&request1); + requests.push_back(&request2); + requests.push_back(&request3); + for (int32 i = 0; i < num_requests - 3; i++) + requests.push_back(&(extra_requests[i])); + Compiler compiler(requests, nnet); + CompilerOptions compiler_opts; + compiler.CreateComputation(compiler_opts, computation); + optimize_opts.optimize_online_computation = true; + + Optimize(optimize_opts, nnet, computation); + + return computation->commands.size() != 0 && + computation->commands.back().command_type == kGotoLabel; +} + +void CompileOnline(const Nnet &nnet, + const NnetOptimizeOptions &optimize_opts, + const ComputationRequest &request1, + const ComputationRequest &request2, + const ComputationRequest &request3, + NnetComputation *computation) { + int32 num_requests1 = 5, factor = 2, max_requests = 100, + num_requests; + + for (num_requests = num_requests1; num_requests <= max_requests; + num_requests *= factor) { + if (CompileOnlineInternal(nnet, optimize_opts, + request1, request2, request3, + num_requests, computation)) { + return; + } else { + KALDI_VLOG(2) << "Online compilation failed with " + << num_requests << " requests, trying " + << (num_requests * factor); + } + } + KALDI_ERR << "Online compilation failed with " + << (num_requests/factor) << " requests, which " + << "we expect should be enough... something " + << "went wrong."; +} + + + + + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/nnet-compile-online.h b/src/nnet3/nnet-compile-online.h new file mode 100644 index 00000000000..100c741fe82 --- /dev/null +++ b/src/nnet3/nnet-compile-online.h @@ -0,0 +1,181 @@ +// nnet3/nnet-compile-online.h + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
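For orientation, a rough usage sketch of the interface defined above and declared in this header (this sketch is not part of the patch; the chunk size, contexts and subsampling factor are illustrative values, and 'nnet' is assumed to be a 'simple' nnet that consumes an "ivector" input):

    int32 frame_subsampling_factor = 3, advised_chunk_size = 20;
    int32 chunk_size = GetChunkSize(nnet, frame_subsampling_factor,
                                    advised_chunk_size);
    int32 ivector_period = chunk_size;  // must divide chunk_size; equal is simplest.
    ModifyNnetIvectorPeriod(ivector_period, &nnet);
    ComputationRequest request1, request2, request3;
    CreateOnlineComputationRequestSimple(nnet, chunk_size,
                                         frame_subsampling_factor,
                                         ivector_period,
                                         0 /* extra_left_context_begin */,
                                         0 /* extra_right_context */,
                                         1 /* num_sequences */,
                                         &request1, &request2, &request3);
    NnetOptimizeOptions optimize_opts;
    NnetComputation computation;
    CompileOnline(nnet, optimize_opts, request1, request2, request3,
                  &computation);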
+ +#ifndef KALDI_NNET3_NNET_COMPILE_ONLINE_H_ +#define KALDI_NNET3_NNET_COMPILE_ONLINE_H_ + +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-utils.h" + +#include + +namespace kaldi { +namespace nnet3 { + + +/** + CompileOnline() provides an internal interface for 'online' computation. + It's usable for inference only (not training), meaning that backprop is + not supported (for now, at least). CompileOnline() allows you to do the + neural net computation for small chunks with increasing 't' values, and + naturally cache the intermediate activations (rather than recomputing them + every time you see new input data). + + This function does both compilation and optimization, so it's like a combination of + Compiler::CreateComputation() [nnet-compile.h] and Optimize() [nnet-optimize.h]. + + You provide 3 computation requests. request1 is the first computation + request of an utterance (or other type of segment) that contains any required + extra left context in the input. request2 and request3 are the second and + third computation request, and must have exactly the same structure, except + for a fixed time offset (change in 't' index) between them. This will be + extrapolated to an infinite sequence of further requests (request4, + request5, etc.). In practice the way it's done is that we extrapolate + to a small finite number of requests (like 10), and then attempt to + identify a common structure in the computation where, after processing, + as an example, the 3nd computation request, the active variables can + be identified with those present at, say, the 7th computation request, and + we then cut and splice the computation together at this points, like + making a tape loop, by adding a goto statement that jumps from the end of + the 7th computation request to the end of the 3rd computation request. + We also have to identify the variables with each other (merge variables). + + That's done in the optimization code. + */ +void CompileOnline(const Nnet &nnet, + const NnetOptimizeOptions &optimize_opts, + const ComputationRequest &request1, + const ComputationRequest &request2, + const ComputationRequest &request3, + NnetComputation *computation); + +/* + This function gives you a suitable chunk size, which is the smallest number >= + 'advised_chunk_size' that is an exact multiple of nnet.Modulus() and + frame_subsampling_factor. This will ensure that all the chunks have the same + structure, which makes compiling the online computation a little more + straightforward. + */ +int32 GetChunkSize(const Nnet &nnet, + int32 frame_subsampling_factor, + int32 advised_chunk_size); + +/** + This function modifies the descriptors in the neural network to change the + periodicity with which it expects to read an iVector at its input. + + We normally train neural networks that expect to see an iVector at frame zero + only; this is because we train on fixed-size chunks and the iVector doesn't + change that much within each chunk. However, expecting just one iVector + isn't that convenient for online recognition because it changes with + time, so we modify the iVector input period in the network by replacing + expressions like ReplaceIndex(ivector, t, 0) or just "t", with + Round(ivector, 10) [assuming ivector_period == 10]. This won't work + in every conceivable network, but it does do what you want in the + cases of interest. + + It does this in a rather simple way, by getting the config lines that + correspond to descriptors, and doing a search-and-replace. 
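 For example (an illustrative descriptor fragment, assuming ivector_period == 10),
 a component-node input written as ReplaceIndex(ivector, t, 0) would be rewritten
 to Round(ivector, 10).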
It's + maybe not ideal, but it was the easiest way to do it. + + */ +void ModifyNnetIvectorPeriod(int32 ivector_period, + Nnet *nnet); + +/** + This function creates computation request suitable for giving to ComputeOnline(). + It's intended for use with a 'simple' nnet (one satisfying IsSimpleNnet()), and this + basically means that the inputs must be named "input" and possibly "ivector", + and that there is an output named "output", and that those are the ones you + care about (it won't generate any other outputs or use any other inputs). + + If you want to use online computation for different types of neural net, you + should use the deeper interface, CompileOnline(). + + @param [in] nnet The neural net this computation request is to be used with. + This is used to check whether the neural net accepts iVectors, + and to work out the left-context and right-context required + by the network. + @param [in] chunk_size The number of frames of output that will be generated + for each chunk (note: this is the shift in the t-index, which will not + equal the number of output frames if frame_subsampling_factor != 1). + Note: it is required that chunk_size be a multiple of ivector_period, + frame_subsampling_factor, and nnet.Modulus(). You should use + GetChunkSize() to compute the chunk size, giving it an advisory/ + minimum chunksize, to make sure it satisfies these properties. + @param [in] frame_subsampling_factor This will normally be 1, but may be + more than 1 (e.g. 3) in chain systems; it determines the frame-skipping + on the output, so we evaluate the output with 't' at multiples of + this value. + @param [in] ivector_period The period with which iVectors are to be supplied + to the network (if you're using iVectors). Not necessarily the + same as the period with which the ivectors are extracted or + stored on disk (--online-ivector-period). You will normally set + this to the chunk size. It must divide the chunk size (if you're + using iVectors) Note: you should call ModifyNnetIvectorPeriod on + 'nnet' before calling this function; otherwise the neural net + will most likely not actually be able to consume the iVector with + this frequency. + @param [in] extra_left_context_begin The additional left-context that + should be supplied to the network on top of the minimum + that the network requires. We call this extra_left_context_begin + because this only relates to the start of the utterance (t=0). + @param [in] num_sequences The number of separate 'n' values to put in the computation; + normally this will be just 1, but it can be increased to allow + simultaneous operation on multiple streams of input. + @param [out] request1 The first of the 3 requests that this function + generates, that the user should then supply to CompileOnline(). + Note: this will tend to be the largest computation request in + terms of input, because we have to provide enough left and right + context that it can evaluate the first chunk. Note: as + elsewhere, the job of duplicating first and last frames enough to + provide the required left/right context to the network, is left + to the caller (at runtime, not during compilation). + @param [out] request2 The second of the 3 requests that this function generates. + Caution: none of the inputs and outputs should overlap. + @param [out] request3 The third of the 3 requests that this function generates. + It will be the same as request2, except for a time offset. 
+*/ +void CreateOnlineComputationRequestSimple(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 extra_left_context_begin, + int32 extra_right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3); + +struct NnetSimpleOnlineComputationOptions { + +}; + +void CreateLoopedComputationSimple( + const Nnet &nnet, // ... TODO... + ); + + + + +} // namespace nnet3 +} // namespace kaldi + + +#endif From 10d6a1a34407a90e478dd491dbce7e097d2a031e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 10 Nov 2016 23:15:22 -0500 Subject: [PATCH 016/213] Change name from online to looped (less confusable) --- src/nnet3/Makefile | 2 +- ...mpile-online.cc => nnet-compile-looped.cc} | 20 +++++------ ...compile-online.h => nnet-compile-looped.h} | 32 ++++++++--------- src/nnet3/nnet-compile-test.cc | 16 ++++----- src/nnet3/nnet-optimize-utils.cc | 36 +++++++++---------- src/nnet3/nnet-optimize-utils.h | 6 ++-- src/nnet3/nnet-optimize.cc | 16 ++++----- src/nnet3/nnet-optimize.h | 8 ++--- 8 files changed, 68 insertions(+), 68 deletions(-) rename src/nnet3/{nnet-compile-online.cc => nnet-compile-looped.cc} (96%) rename src/nnet3/{nnet-compile-online.h => nnet-compile-looped.h} (92%) diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 60629ab1cbe..8dfa3120fac 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -28,7 +28,7 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ discriminative-supervision.o nnet-discriminative-example.o \ nnet-discriminative-diagnostics.o \ discriminative-training.o nnet-discriminative-training.o \ - online-nnet3-decodable-simple.o nnet-compile-online.o + online-nnet3-decodable-simple.o nnet-compile-looped.o LIBNAME = kaldi-nnet3 diff --git a/src/nnet3/nnet-compile-online.cc b/src/nnet3/nnet-compile-looped.cc similarity index 96% rename from src/nnet3/nnet-compile-online.cc rename to src/nnet3/nnet-compile-looped.cc index 21430d79bbc..71329d2e8fe 100644 --- a/src/nnet3/nnet-compile-online.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -1,4 +1,4 @@ -// nnet3/nnet-compile-online.cc +// nnet3/nnet-compile-looped.cc // Copyright 2016 Johns Hopkins University (author: Daniel Povey) @@ -17,7 +17,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#include "nnet3/nnet-compile-online.h" +#include "nnet3/nnet-compile-looped.h" #include "nnet3/nnet-utils.h" namespace kaldi { @@ -136,7 +136,7 @@ static void CreateComputationRequestInternal( } -void CreateOnlineComputationRequestSimple(const Nnet &nnet, +void CreateLoopedComputationRequestSimple(const Nnet &nnet, int32 chunk_size, int32 frame_subsampling_factor, int32 ivector_period, @@ -249,7 +249,7 @@ static bool ExtrapolateComputationRequest( } -/* Internal version of CompileOnline where +/* Internal version of CompileLooped where you specify the the number of computation requests (must be >= 3). Returns true on success. It's possible for the optimization to fail if you give too small @@ -257,7 +257,7 @@ static bool ExtrapolateComputationRequest( and in that case this function will return false and you should re-try with a higher value of num_requests. 
*/ -static bool CompileOnlineInternal( +static bool CompileLoopedInternal( const Nnet &nnet, NnetOptimizeOptions optimize_opts, const ComputationRequest &request1, @@ -291,7 +291,7 @@ static bool CompileOnlineInternal( Compiler compiler(requests, nnet); CompilerOptions compiler_opts; compiler.CreateComputation(compiler_opts, computation); - optimize_opts.optimize_online_computation = true; + optimize_opts.optimize_looped_computation = true; Optimize(optimize_opts, nnet, computation); @@ -299,7 +299,7 @@ static bool CompileOnlineInternal( computation->commands.back().command_type == kGotoLabel; } -void CompileOnline(const Nnet &nnet, +void CompileLooped(const Nnet &nnet, const NnetOptimizeOptions &optimize_opts, const ComputationRequest &request1, const ComputationRequest &request2, @@ -310,17 +310,17 @@ void CompileOnline(const Nnet &nnet, for (num_requests = num_requests1; num_requests <= max_requests; num_requests *= factor) { - if (CompileOnlineInternal(nnet, optimize_opts, + if (CompileLoopedInternal(nnet, optimize_opts, request1, request2, request3, num_requests, computation)) { return; } else { - KALDI_VLOG(2) << "Online compilation failed with " + KALDI_VLOG(2) << "Looped compilation failed with " << num_requests << " requests, trying " << (num_requests * factor); } } - KALDI_ERR << "Online compilation failed with " + KALDI_ERR << "Looped compilation failed with " << (num_requests/factor) << " requests, which " << "we expect should be enough... something " << "went wrong."; diff --git a/src/nnet3/nnet-compile-online.h b/src/nnet3/nnet-compile-looped.h similarity index 92% rename from src/nnet3/nnet-compile-online.h rename to src/nnet3/nnet-compile-looped.h index 100c741fe82..00a97292798 100644 --- a/src/nnet3/nnet-compile-online.h +++ b/src/nnet3/nnet-compile-looped.h @@ -1,4 +1,4 @@ -// nnet3/nnet-compile-online.h +// nnet3/nnet-compile-looped.h // Copyright 2016 Johns Hopkins University (author: Daniel Povey) @@ -17,8 +17,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#ifndef KALDI_NNET3_NNET_COMPILE_ONLINE_H_ -#define KALDI_NNET3_NNET_COMPILE_ONLINE_H_ +#ifndef KALDI_NNET3_NNET_COMPILE_LOOPED_H_ +#define KALDI_NNET3_NNET_COMPILE_LOOPED_H_ #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-utils.h" @@ -30,9 +30,9 @@ namespace nnet3 { /** - CompileOnline() provides an internal interface for 'online' computation. + CompileLooped() provides an internal interface for 'looped' computation. It's usable for inference only (not training), meaning that backprop is - not supported (for now, at least). CompileOnline() allows you to do the + not supported (for now, at least). CompileLooped() allows you to do the neural net computation for small chunks with increasing 't' values, and naturally cache the intermediate activations (rather than recomputing them every time you see new input data). @@ -58,7 +58,7 @@ namespace nnet3 { That's done in the optimization code. */ -void CompileOnline(const Nnet &nnet, +void CompileLooped(const Nnet &nnet, const NnetOptimizeOptions &optimize_opts, const ComputationRequest &request1, const ComputationRequest &request2, @@ -69,7 +69,7 @@ void CompileOnline(const Nnet &nnet, This function gives you a suitable chunk size, which is the smallest number >= 'advised_chunk_size' that is an exact multiple of nnet.Modulus() and frame_subsampling_factor. 
This will ensure that all the chunks have the same - structure, which makes compiling the online computation a little more + structure, which makes compiling the looped computation a little more straightforward. */ int32 GetChunkSize(const Nnet &nnet, @@ -83,7 +83,7 @@ int32 GetChunkSize(const Nnet &nnet, We normally train neural networks that expect to see an iVector at frame zero only; this is because we train on fixed-size chunks and the iVector doesn't change that much within each chunk. However, expecting just one iVector - isn't that convenient for online recognition because it changes with + isn't that convenient for looped recognition because it changes with time, so we modify the iVector input period in the network by replacing expressions like ReplaceIndex(ivector, t, 0) or just "t", with Round(ivector, 10) [assuming ivector_period == 10]. This won't work @@ -99,14 +99,14 @@ void ModifyNnetIvectorPeriod(int32 ivector_period, Nnet *nnet); /** - This function creates computation request suitable for giving to ComputeOnline(). + This function creates computation request suitable for giving to ComputeLooped(). It's intended for use with a 'simple' nnet (one satisfying IsSimpleNnet()), and this basically means that the inputs must be named "input" and possibly "ivector", and that there is an output named "output", and that those are the ones you care about (it won't generate any other outputs or use any other inputs). - If you want to use online computation for different types of neural net, you - should use the deeper interface, CompileOnline(). + If you want to use looped computation for different types of neural net, you + should use the deeper interface, CompileLooped(). @param [in] nnet The neural net this computation request is to be used with. This is used to check whether the neural net accepts iVectors, @@ -140,7 +140,7 @@ void ModifyNnetIvectorPeriod(int32 ivector_period, normally this will be just 1, but it can be increased to allow simultaneous operation on multiple streams of input. @param [out] request1 The first of the 3 requests that this function - generates, that the user should then supply to CompileOnline(). + generates, that the user should then supply to CompileLooped(). Note: this will tend to be the largest computation request in terms of input, because we have to provide enough left and right context that it can evaluate the first chunk. Note: as @@ -152,7 +152,7 @@ void ModifyNnetIvectorPeriod(int32 ivector_period, @param [out] request3 The third of the 3 requests that this function generates. It will be the same as request2, except for a time offset. */ -void CreateOnlineComputationRequestSimple(const Nnet &nnet, +void CreateLoopedComputationRequestSimple(const Nnet &nnet, int32 chunk_size, int32 frame_subsampling_factor, int32 ivector_period, @@ -163,12 +163,12 @@ void CreateOnlineComputationRequestSimple(const Nnet &nnet, ComputationRequest *request2, ComputationRequest *request3); -struct NnetSimpleOnlineComputationOptions { - +struct NnetSimpleLoopedComputationOptions { + // TODO }; void CreateLoopedComputationSimple( - const Nnet &nnet, // ... TODO... + const Nnet &nnet // ... TODO... 
); diff --git a/src/nnet3/nnet-compile-test.cc b/src/nnet3/nnet-compile-test.cc index eaff78ad4c6..1b9c0d3e381 100644 --- a/src/nnet3/nnet-compile-test.cc +++ b/src/nnet3/nnet-compile-test.cc @@ -19,7 +19,7 @@ #include "nnet3/nnet-nnet.h" #include "nnet3/nnet-compile.h" -#include "nnet3/nnet-compile-online.h" +#include "nnet3/nnet-compile-looped.h" #include "nnet3/nnet-test-utils.h" namespace kaldi { @@ -59,7 +59,7 @@ void UnitTestNnetCompile() { // this tests compilation where there are more than one // computation-request... this is to test some of the -// low-level utilities that will be used in online computation. +// low-level utilities that will be used in looped computation. void UnitTestNnetCompileMulti() { for (int32 n = 0; n < 20; n++) { struct NnetGenerationOptions gen_config; @@ -117,7 +117,7 @@ void UnitTestNnetCompileMulti() { -void UnitTestNnetCompileOnline() { +void UnitTestNnetCompileLooped() { for (int32 n = 0; n < 20; n++) { struct NnetGenerationOptions gen_config; gen_config.allow_ivector = true; @@ -146,7 +146,7 @@ void UnitTestNnetCompileOnline() { ModifyNnetIvectorPeriod(ivector_period, &nnet); KALDI_LOG << "Nnet info after modifying ivector period is: " << nnet.Info(); - CreateOnlineComputationRequestSimple( + CreateLoopedComputationRequestSimple( nnet, chunk_size, frame_subsampling_factor, ivector_period, extra_left_context_begin, extra_right_context, num_sequences, &request1, &request2, &request3); @@ -159,12 +159,12 @@ void UnitTestNnetCompileOnline() { request3.Print(std::cerr); NnetOptimizeOptions optimize_opts; - // todo: set optimize-online=true. + // todo: set optimize-looped=true. NnetComputation computation; - CompileOnline(nnet, optimize_opts, + CompileLooped(nnet, optimize_opts, request1, request2, request3, &computation); - KALDI_LOG << "Compiled online computation is "; + KALDI_LOG << "Compiled looped computation is "; computation.Print(std::cerr, nnet); } } @@ -179,7 +179,7 @@ int main() { using namespace kaldi::nnet3; SetVerboseLevel(4); - UnitTestNnetCompileOnline(); + UnitTestNnetCompileLooped(); UnitTestNnetCompile(); UnitTestNnetCompileMulti(); diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 7a0fafb0b5e..4f9d3ec078c 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -1664,9 +1664,9 @@ void LimitDerivativeTimes(const Nnet &nnet, } -class ComputationOnlineOptimizer { +class ComputationLoopedOptimizer { public: - ComputationOnlineOptimizer(const Nnet &nnet, + ComputationLoopedOptimizer(const Nnet &nnet, NnetComputation *computation): nnet_(nnet), computation_(computation) { } bool Optimize(); @@ -1834,7 +1834,7 @@ class ComputationOnlineOptimizer { // static -int32 ComputationOnlineOptimizer::FindTimeShift( +int32 ComputationLoopedOptimizer::FindTimeShift( const NnetComputation &computation, const std::vector &segment_ends) { KALDI_ASSERT(segment_ends.size() >= 3); @@ -1884,7 +1884,7 @@ int32 ComputationOnlineOptimizer::FindTimeShift( } // static -void ComputationOnlineOptimizer::CreateMatrixPairs( +void ComputationLoopedOptimizer::CreateMatrixPairs( const NnetComputation &computation, std::vector > *matrix_to_pair) { typedef unordered_map, int32, @@ -1920,7 +1920,7 @@ void ComputationOnlineOptimizer::CreateMatrixPairs( } // static -void ComputationOnlineOptimizer::GetPairToMatrixMap( +void ComputationLoopedOptimizer::GetPairToMatrixMap( std::vector > &matrix_to_pair, unordered_map, int32, PairHasher > *pair_to_matrix) { int32 num_matrices = matrix_to_pair.size(); @@ -1932,7 
+1932,7 @@ void ComputationOnlineOptimizer::GetPairToMatrixMap( // static -void ComputationOnlineOptimizer::ConvertListsToPairLists( +void ComputationLoopedOptimizer::ConvertListsToPairLists( const std::vector > &active_matrices, const std::vector > &matrix_to_pair, std::vector > > *active_pairs) { @@ -1956,7 +1956,7 @@ void ComputationOnlineOptimizer::ConvertListsToPairLists( } // static -void ComputationOnlineOptimizer::NormalizePairLists( +void ComputationLoopedOptimizer::NormalizePairLists( std::vector > > *active_pairs, std::vector *time_offsets) { int32 num_segments = active_pairs->size(); @@ -1983,7 +1983,7 @@ void ComputationOnlineOptimizer::NormalizePairLists( // static -bool ComputationOnlineOptimizer::FindFirstRepeat( +bool ComputationLoopedOptimizer::FindFirstRepeat( const std::vector > > &normalized_active_pairs, const std::vector &time_offsets, int32 time_shift_per_segment, @@ -2019,7 +2019,7 @@ bool ComputationOnlineOptimizer::FindFirstRepeat( } // static -void ComputationOnlineOptimizer::PairListToMatrixList( +void ComputationLoopedOptimizer::PairListToMatrixList( const std::vector > &pair_list, const unordered_map, int32, PairHasher > &pair_to_matrix, std::vector *matrix_list) { @@ -2041,7 +2041,7 @@ void ComputationOnlineOptimizer::PairListToMatrixList( // static -void ComputationOnlineOptimizer::FindActiveMatrices( +void ComputationLoopedOptimizer::FindActiveMatrices( const NnetComputation &computation, const Analyzer &analyzer, const std::vector &segment_end_commands, @@ -2079,7 +2079,7 @@ void ComputationOnlineOptimizer::FindActiveMatrices( } // static -void ComputationOnlineOptimizer::CheckIdentifiedMatrices( +void ComputationLoopedOptimizer::CheckIdentifiedMatrices( const NnetComputation &computation, const std::vector &list1, const std::vector &list2, @@ -2114,7 +2114,7 @@ void ComputationOnlineOptimizer::CheckIdentifiedMatrices( // static -void ComputationOnlineOptimizer::GetMatrixSwapOrder( +void ComputationLoopedOptimizer::GetMatrixSwapOrder( const std::vector &matrices1, const std::vector &matrices2, std::vector > *swaps) { @@ -2166,7 +2166,7 @@ void ComputationOnlineOptimizer::GetMatrixSwapOrder( } // static -void ComputationOnlineOptimizer::AddMatrixSwapCommands( +void ComputationLoopedOptimizer::AddMatrixSwapCommands( const std::vector &matrices1, const std::vector &matrices2, NnetComputation *computation) { @@ -2201,7 +2201,7 @@ void ComputationOnlineOptimizer::AddMatrixSwapCommands( } // static -void ComputationOnlineOptimizer::FormInfiniteLoop( +void ComputationLoopedOptimizer::FormInfiniteLoop( int32 command1, int32 command2, NnetComputation *computation) { KALDI_ASSERT(static_cast(computation->commands.size()) >= @@ -2221,11 +2221,11 @@ void ComputationOnlineOptimizer::FormInfiniteLoop( -bool ComputationOnlineOptimizer::Optimize() { +bool ComputationLoopedOptimizer::Optimize() { analyzer_.Init(nnet_, *computation_); KALDI_ASSERT(!computation_->matrix_debug_info.empty() && "You must request matrix debug info when compiling " - "online computations."); + "looped computations."); // get the indexes of the separator commands at the ends of segments. 
std::vector segment_ends; @@ -2304,9 +2304,9 @@ bool ComputationOnlineOptimizer::Optimize() { } -void OptimizeOnlineComputation(const Nnet &nnet, +void OptimizeLoopedComputation(const Nnet &nnet, NnetComputation *computation) { - ComputationOnlineOptimizer optimizer(nnet, computation); + ComputationLoopedOptimizer optimizer(nnet, computation); optimizer.Optimize(); } diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index f2448f46fe5..29f05add695 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -566,7 +566,7 @@ void IdentifyIndexesArgs(std::vector *commands, void IdentifyIndexesRangesArgs(std::vector *commands, std::vector *indexes_ranges_args); -/// This function tries to optimize computation 'computation' for an 'online' +/// This function tries to optimize computation 'computation' for an 'looped' /// computation. It expects as input a computation with no backprop but with /// multiple 'segments' separated by command kNoOperation, where each segment /// corresponds to a new chunk of input and output. It tries to locate a pair @@ -578,13 +578,13 @@ void IdentifyIndexesRangesArgs(std::vector *commands, /// case by checking whether kGotoLabel is the last command in the computation. /// [If this optimization fails, the whole computation may have to be /// regenerated with more segments.] -void OptimizeOnlineComputation(const Nnet &nnet, +void OptimizeLoopedComputation(const Nnet &nnet, NnetComputation *computation); /// This function ensures that the arg1 of a final command of type kGotoLabel is /// the same as the command with type kNoOperationLabel. This is necessary -/// if you do any other type of optimization after 'OptimizeOnlineComputation()'. +/// if you do any other type of optimization after 'OptimizeLoopedComputation()'. void FixGotoLabel(NnetComputation *computation); diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 480ed5cd41f..b4b38b5f736 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -468,18 +468,18 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, false); } - // the online computation optimization has to go before + // the looped computation optimization has to go before // 'RemoveUnnecessaryAllocation()'. We don't gate this by 'config.optimize' - // because it's necessary for online computation to run. - if (config.optimize_online_computation){ - OptimizeOnlineComputation(nnet, computation); + // because it's necessary for looped computation to run. + if (config.optimize_looped_computation){ + OptimizeLoopedComputation(nnet, computation); if (GetVerboseLevel() >= 4) CheckComputation(nnet, *computation, false); } if (config.optimize && config.allocate_from_other && - !config.optimize_online_computation) { - // Don't do this if it's an online computation because we're not sure if it + !config.optimize_looped_computation) { + // Don't do this if it's an looped computation because we're not sure if it // would be correct in that case, as written. In any case the performance // benefit is tiny. RemoveUnnecessaryAllocation(nnet, computation); @@ -493,7 +493,7 @@ void Optimize(const NnetOptimizeOptions &config, // other optimizations.) 
ConsolidateIoOperations(nnet, computation); - if (config.optimize_online_computation) + if (config.optimize_looped_computation) FixGotoLabel(computation); if (GetVerboseLevel() >= 4) @@ -748,7 +748,7 @@ void ConsolidateIoOperations(const Nnet &nnet, if (ends_with_goto) { // If, before this operation, the last command was kGotoLael, remove all // commands that have been reordered to go after the kGotoLabel command - // [they would be unreachable anyway.] This relates to online computations. + // [they would be unreachable anyway.] This relates to looped computations. // It may seem wrong that we are just removing these // kAcceptInput/kProvideOutput commands, but the reason it's OK // (and preserves equivalence with the code prior to this function call), diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 303b08a4150..27871552017 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -46,10 +46,10 @@ struct NnetOptimizeOptions { bool allocate_from_other; int32 min_deriv_time; int32 max_deriv_time; - // optimize_online_computation is a 'hidden config' not available from + // optimize_looped_computation is a 'hidden config' not available from // the command line; it's set to true to enable the optimization for - // online computation that turns a linear computation into a loop. - bool optimize_online_computation; + // looped computation that turns a linear computation into a loop. + bool optimize_looped_computation; NnetOptimizeOptions(): optimize(true), consolidate_model_update(true), @@ -64,7 +64,7 @@ struct NnetOptimizeOptions { allocate_from_other(true), min_deriv_time(std::numeric_limits::min()), max_deriv_time(std::numeric_limits::max()), - optimize_online_computation(false) { } + optimize_looped_computation(false) { } void Register(OptionsItf *opts) { opts->Register("optimize", &optimize, "Set this to false to turn off all " From 87f695b9999d34b7154c5e1d740baf880e0705c7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 13 Nov 2016 19:19:38 -0500 Subject: [PATCH 017/213] Finishing the decodable objects (not yet for online computatoin), and add tests, and debug to the extent that the tests succeed. 
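A hypothetical usage sketch of the decodable classes added below (this is not part of the original commit message; 'am_nnet', 'trans_model' and 'feats' are assumed to have been loaded elsewhere):

    NnetSimpleLoopedComputationOptions opts;
    opts.acoustic_scale = 1.0;  // default is 0.1; see the warning in nnet-am-decodable-simple.h.
    // Compiles the looped computation once; the info object can be shared
    // across utterances.
    DecodableNnetSimpleLoopedInfo info(opts, &am_nnet);
    DecodableAmNnetSimpleLooped decodable(info, trans_model, feats);
    // 'decodable' can now be handed to a decoder; per-frame likelihoods look like:
    BaseFloat loglike = decodable.LogLikelihood(0 /* frame */, 1 /* transition-id */);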
--- src/nnet3/Makefile | 3 +- src/nnet3/decodable-simple-looped.cc | 251 ++++++++++++++++++++ src/nnet3/decodable-simple-looped.h | 328 ++++++++++++++++++++++++++ src/nnet3/nnet-am-decodable-simple.cc | 43 ++-- src/nnet3/nnet-am-decodable-simple.h | 24 +- src/nnet3/nnet-compile-looped.h | 7 - src/nnet3/nnet-computation.h | 4 +- src/nnet3/nnet-compute-test.cc | 70 +++++- src/nnet3/nnet-compute.cc | 16 +- src/nnet3/nnet-compute.h | 4 +- src/nnet3/nnet-graph.cc | 24 +- src/nnet3/nnet-graph.h | 8 + src/nnet3/nnet-optimize-utils.cc | 222 ++++++++++++++++- src/nnet3/nnet-optimize-utils.h | 184 +-------------- src/nnet3/nnet-optimize.cc | 117 ++++++--- src/nnet3/nnet-optimize.h | 5 + src/nnet3/nnet-test-utils.cc | 2 + src/nnet3/nnet-test-utils.h | 2 + src/nnet3/nnet-utils.cc | 8 + src/nnet3/nnet-utils.h | 3 + 20 files changed, 1053 insertions(+), 272 deletions(-) create mode 100644 src/nnet3/decodable-simple-looped.cc create mode 100644 src/nnet3/decodable-simple-looped.h diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 8dfa3120fac..ef50f9960e1 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -28,7 +28,8 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ discriminative-supervision.o nnet-discriminative-example.o \ nnet-discriminative-diagnostics.o \ discriminative-training.o nnet-discriminative-training.o \ - online-nnet3-decodable-simple.o nnet-compile-looped.o + online-nnet3-decodable-simple.o nnet-compile-looped.o \ + decodable-simple-looped.o LIBNAME = kaldi-nnet3 diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc new file mode 100644 index 00000000000..9e580dc121f --- /dev/null +++ b/src/nnet3/decodable-simple-looped.cc @@ -0,0 +1,251 @@ +// nnet3/decodable-simple-looped.cc + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
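To make the chunk bookkeeping in this file concrete, here is a worked example under assumed values (illustrative only): frames_per_chunk_ = 21, frame_subsampling_factor = 3, frames_left_context_ = 20, frames_right_context_ = 15. Each call to AdvanceChunk() then yields 21 / 3 = 7 rows of log-posteriors:

    chunk 0: input t in [-20, 36)  ->  subsampled output frames 0..6   (offset 0)
    chunk 1: input t in [21, 42)   ->  subsampled output frames 7..13  (offset 7)
    chunk 2: input t in [42, 63)   ->  subsampled output frames 14..20 (offset 14)

GetOutputForFrame() and GetOutput() just keep calling AdvanceChunk() until the requested subsampled frame falls inside the current block.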
+ +#include "nnet3/decodable-simple-looped.h" +#include "nnet3/nnet-utils.h" +#include "nnet3/nnet-compile-looped.h" + +namespace kaldi { +namespace nnet3 { + + +DecodableNnetSimpleLoopedInfo::DecodableNnetSimpleLoopedInfo( + const NnetSimpleLoopedComputationOptions &opts, + Nnet *nnet): + opts_(opts), nnet_(*nnet) { + Init(opts, nnet); +} + +DecodableNnetSimpleLoopedInfo::DecodableNnetSimpleLoopedInfo( + const NnetSimpleLoopedComputationOptions &opts, + const Vector &priors, + Nnet *nnet): + opts_(opts), nnet_(*nnet), log_priors_(priors) { + if (log_priors_.Dim() != 0) + log_priors_.ApplyLog(); + Init(opts, nnet); +} + + +DecodableNnetSimpleLoopedInfo::DecodableNnetSimpleLoopedInfo( + const NnetSimpleLoopedComputationOptions &opts, + AmNnetSimple *am_nnet): + opts_(opts), nnet_(am_nnet->GetNnet()), log_priors_(am_nnet->Priors()) { + if (log_priors_.Dim() != 0) + log_priors_.ApplyLog(); + Init(opts, &(am_nnet->GetNnet())); +} + + +void DecodableNnetSimpleLoopedInfo::Init( + const NnetSimpleLoopedComputationOptions &opts, + Nnet *nnet) { + opts.Check(); + KALDI_ASSERT(IsSimpleNnet(*nnet)); + has_ivectors_ = (nnet->InputDim("ivector") > 0); + int32 left_context, right_context; + ComputeSimpleNnetContext(*nnet, &left_context, &right_context); + frames_left_context_ = left_context + opts.extra_left_context_initial; + frames_right_context_ = right_context; + frames_per_chunk_ = GetChunkSize(*nnet, opts_.frame_subsampling_factor, + opts.frames_per_chunk); + output_dim_ = nnet->OutputDim("output"); + KALDI_ASSERT(output_dim_ > 0); + // note, ivector_period is hardcoded to the same as frames_per_chunk_. + int32 ivector_period = frames_per_chunk_; + if (has_ivectors_) + ModifyNnetIvectorPeriod(ivector_period, nnet); + + ComputationRequest request1, request2, request3; + int32 num_sequences = 1; // we're processing one utterance at a time. 
+ int32 extra_right_context = 0; + CreateLoopedComputationRequestSimple(*nnet, frames_per_chunk_, + opts_.frame_subsampling_factor, + ivector_period, opts.extra_left_context_initial, + extra_right_context, + num_sequences, + &request1, &request2, &request3); + + CompileLooped(*nnet, opts_.optimize_config, request1, request2, request3, + &computation_); + computation_.ComputeCudaIndexes(); + KALDI_LOG << "Computation is:"; + computation_.Print(std::cerr, *nnet); +} + + +DecodableNnetSimpleLooped::DecodableNnetSimpleLooped( + const DecodableNnetSimpleLoopedInfo &info, + const MatrixBase &feats, + const VectorBase *ivector, + const MatrixBase *online_ivectors, + int32 online_ivector_period): + info_(info), + computer_(info_.opts_.compute_config, info_.computation_, + info_.nnet_, NULL), + feats_(feats), + ivector_(ivector), online_ivector_feats_(online_ivectors), + online_ivector_period_(online_ivector_period), + num_chunks_computed_(0), + current_log_post_subsampled_offset_(-1) { + num_subsampled_frames_ = + (feats_.NumRows() + info_.opts_.frame_subsampling_factor - 1) / + info_.opts_.frame_subsampling_factor; + KALDI_ASSERT(!(ivector != NULL && online_ivectors != NULL)); + KALDI_ASSERT(!(online_ivectors != NULL && online_ivector_period <= 0 && + "You need to set the --online-ivector-period option!")); +} + + +void DecodableNnetSimpleLooped::GetOutputForFrame( + int32 subsampled_frame, VectorBase *output) { + KALDI_ASSERT(subsampled_frame >= current_log_post_subsampled_offset_ && + "Frames must be accessed in order."); + while (subsampled_frame >= current_log_post_subsampled_offset_ + + current_log_post_.NumRows()) + AdvanceChunk(); + output->CopyFromVec(current_log_post_.Row( + subsampled_frame - current_log_post_subsampled_offset_)); +} + +int32 DecodableNnetSimpleLooped::GetIvectorDim() const { + if (ivector_ != NULL) + return ivector_->Dim(); + else if (online_ivector_feats_ != NULL) + return online_ivector_feats_->NumCols(); + else + return 0; +} + + +void DecodableNnetSimpleLooped::AdvanceChunk() { + int32 begin_input_frame, end_input_frame; + if (num_chunks_computed_ == 0) { + begin_input_frame = -info_.frames_left_context_; + // note: end is last plus one. 
+ end_input_frame = info_.frames_per_chunk_ + info_.frames_right_context_; + } else { + begin_input_frame = num_chunks_computed_ * info_.frames_per_chunk_; + end_input_frame = begin_input_frame + info_.frames_per_chunk_; + } + CuMatrix feats_chunk(end_input_frame - begin_input_frame, + feats_.NumCols(), kUndefined); + + int32 num_features = feats_.NumRows(); + if (begin_input_frame >= 0 && end_input_frame <= num_features) { + SubMatrix this_feats(feats_, + begin_input_frame, + end_input_frame - begin_input_frame, + 0, feats_.NumCols()); + feats_chunk.CopyFromMat(this_feats); + } else { + Matrix this_feats(end_input_frame - begin_input_frame, + feats_.NumCols()); + for (int32 r = begin_input_frame; r < end_input_frame; r++) { + int32 input_frame = r; + if (input_frame < 0) input_frame = 0; + if (input_frame >= num_features) input_frame = num_features - 1; + this_feats.Row(r - begin_input_frame).CopyFromVec( + feats_.Row(input_frame)); + } + feats_chunk.CopyFromMat(this_feats); + } + computer_.AcceptInput("input", &feats_chunk); + + if (info_.has_ivectors_) { + Vector ivector; + GetCurrentIvector(end_input_frame, &ivector); + CuMatrix cu_ivector(1, ivector.Dim()); + cu_ivector.Row(0).CopyFromVec(ivector); + computer_.AcceptInput("ivector", &cu_ivector); + } + computer_.Run(); + + { + // on GPU if we're using one, while avoiding unnecessary copies if we're not + // using the GPU. + + // Note: it's possible in theory that if you had weird recurrence that went + // directly from the output, the call to GetOutputDestructive() would cause + // a crash on the next chunk. But we don't anticipate this will happen in + // practice. + CuMatrix output; + computer_.GetOutputDestructive("output", &output); + + if (info_.log_priors_.Dim() != 0) { + // subtract log-prior (divide by prior) + output.AddVecToRows(-1.0, info_.log_priors_); + } + // apply the acoustic scale + output.Scale(info_.opts_.acoustic_scale); + current_log_post_.Resize(0, 0); + current_log_post_.Swap(&output); + } + KALDI_ASSERT(current_log_post_.NumRows() == info_.frames_per_chunk_ / + info_.opts_.frame_subsampling_factor && + current_log_post_.NumCols() == info_.output_dim_); + + num_chunks_computed_++; + + current_log_post_subsampled_offset_ = + (num_chunks_computed_ - 1) * + (info_.frames_per_chunk_ / info_.opts_.frame_subsampling_factor); +} + + +void DecodableNnetSimpleLooped::GetCurrentIvector(int32 input_frame, + Vector *ivector) { + if (!info_.has_ivectors_) + return; + if (ivector_ != NULL) { + *ivector = *ivector_; + return; + } else if (online_ivector_feats_ == NULL) { + KALDI_ERR << "Neural net expects iVectors but none provided."; + } + KALDI_ASSERT(online_ivector_period_ > 0); + int32 ivector_frame = input_frame / online_ivector_period_; + KALDI_ASSERT(ivector_frame >= 0); + if (ivector_frame >= online_ivector_feats_->NumRows()) + ivector_frame = online_ivector_feats_->NumRows() - 1; + KALDI_ASSERT(ivector_frame >= 0 && "ivector matrix cannot be empty."); + *ivector = online_ivector_feats_->Row(ivector_frame); +} + + +DecodableAmNnetSimpleLooped::DecodableAmNnetSimpleLooped( + const DecodableNnetSimpleLoopedInfo &info, + const TransitionModel &trans_model, + const MatrixBase &feats, + const VectorBase *ivector, + const MatrixBase *online_ivectors, + int32 online_ivector_period): + decodable_nnet_(info, feats, ivector, online_ivectors, online_ivector_period), + trans_model_(trans_model) { } + +BaseFloat DecodableAmNnetSimpleLooped::LogLikelihood(int32 frame, + int32 transition_id) { + int32 pdf_id = 
trans_model_.TransitionIdToPdf(transition_id); + return decodable_nnet_.GetOutput(frame, pdf_id); +} + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/decodable-simple-looped.h b/src/nnet3/decodable-simple-looped.h new file mode 100644 index 00000000000..fe40c220f8f --- /dev/null +++ b/src/nnet3/decodable-simple-looped.h @@ -0,0 +1,328 @@ +// nnet3/decodable-simple-looped.h + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_DECODABLE_SIMPLE_LOOPED_H_ +#define KALDI_NNET3_DECODABLE_SIMPLE_LOOPED_H_ + +#include +#include "base/kaldi-common.h" +#include "gmm/am-diag-gmm.h" +#include "hmm/transition-model.h" +#include "itf/decodable-itf.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/am-nnet-simple.h" + +namespace kaldi { +namespace nnet3 { + +// See also nnet-am-decodable-simple.h, which is a decodable object that's based +// on breaking up the input into fixed chunks. The decodable object defined here is based on +// 'looped' computations, which naturally handle infinite left-context (but are +// only ideal for systems that have only recurrence in the forward direction, +// i.e. not BLSTMs... because there isn't a natural way to enforce extra right +// context for each chunk.) + + +// Note: the 'simple' in the name means it applies to networks for which +// IsSimpleNnet(nnet) would return true. 'looped' means we use looped +// computations, with a kGotoLabel statement at the end of it. +struct NnetSimpleLoopedComputationOptions { + int32 extra_left_context_initial; + int32 frame_subsampling_factor; + int32 frames_per_chunk; + BaseFloat acoustic_scale; + bool debug_computation; + NnetOptimizeOptions optimize_config; + NnetComputeOptions compute_config; + + NnetSimpleLoopedComputationOptions(): + extra_left_context_initial(0), + frame_subsampling_factor(1), + frames_per_chunk(20), + acoustic_scale(0.1), + debug_computation(false) { } + + void Check() const { + KALDI_ASSERT(extra_left_context_initial >= 0 && + frame_subsampling_factor > 0 && frames_per_chunk > 0 && + acoustic_scale > 0.0); + } + + void Register(OptionsItf *opts) { + opts->Register("extra-left-context-initial", &extra_left_context_initial, + "Extra left context to use at the first frame of an utterance (note: " + "this will just consist of repeats of the first frame, and should not " + "usually be necessary."); + opts->Register("frame-subsampling-factor", &frame_subsampling_factor, + "Required if the frame-rate of the output (e.g. 
in 'chain' " + "models) is less than the frame-rate of the original " + "alignment."); + opts->Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic log-likelihoods"); + opts->Register("frames-per-chunk", &frames_per_chunk, + "Number of frames in each chunk that is separately evaluated " + "by the neural net. Measured before any subsampling, if the " + "--frame-subsampling-factor options is used (i.e. counts " + "input frames. This is only advisory (may be rounded up " + "if needed."); + opts->Register("debug-computation", &debug_computation, "If true, turn on " + "debug for the actual computation (very verbose!)"); + + // register the optimization options with the prefix "optimization". + ParseOptions optimization_opts("optimization", opts); + optimize_config.Register(&optimization_opts); + + // register the compute options with the prefix "computation". + ParseOptions compute_opts("computation", opts); + compute_config.Register(&compute_opts); + } +}; + +// forward declaration. +class DecodableNnetSimpleLooped; + + +/** + When you instantiate class DecodableNnetSimpleLooped, you should give it + a const reference to this class, that has been previously initialized. + */ +class DecodableNnetSimpleLoopedInfo { + public: + // The constructor takes a non-const pointer to 'nnet' because it may have to + // modify it to be able to take multiple iVectors. + DecodableNnetSimpleLoopedInfo(const NnetSimpleLoopedComputationOptions &opts, + Nnet *nnet); + + DecodableNnetSimpleLoopedInfo(const NnetSimpleLoopedComputationOptions &opts, + AmNnetSimple *nnet); + + // this constructor is for use in testing. + DecodableNnetSimpleLoopedInfo(const NnetSimpleLoopedComputationOptions &opts, + const Vector &priors, + Nnet *nnet); + + protected: + void Init(const NnetSimpleLoopedComputationOptions &opts, + Nnet *nnet); + + friend class DecodableNnetSimpleLooped; + + + const NnetSimpleLoopedComputationOptions &opts_; + const Nnet &nnet_; + + // the log priors (or the empty vector if the priors are not set in the model) + CuVector log_priors_; + + + // frames_left_context equals the model left context plus any extra left context. + int32 frames_left_context_; + // frames_right_context is the same as the right-context of the model. + int32 frames_right_context_; + // The frames_per_chunk_ equals the number of input frames we need for each + // chunk (except for the first chunk). This divided by + // opts_.frame_subsampling_factor gives the number of output frames. + int32 frames_per_chunk_; + + // The output dimension of the neural network. + int32 output_dim_; + + // True if the neural net accepts iVectors. If so, the neural net will have been modified + // to accept the iVectors + bool has_ivectors_; + + // The compiled, 'looped' computation. + NnetComputation computation_; +}; + +/* + This class handles the neural net computation; it's mostly accessed + via other wrapper classes. + + It can accept just input features, or input features plus iVectors. */ +class DecodableNnetSimpleLooped { + public: + /** + This constructor takes features as input, and you can either supply a + single iVector input, estimated in batch-mode ('ivector'), or 'online' + iVectors ('online_ivectors' and 'online_ivector_period', or none at all. + Note: it stores references to all arguments to the constructor, so don't + delete them till this goes out of scope. + + @param [in] info This helper class contains all the static pre-computed information + this class needs, and contains a pointer to the neural net. 
+ @param [in] feats The input feature matrix. + @param [in] ivector If you are using iVectors estimated in batch mode, + a pointer to the iVector, else NULL. + @param [in] ivector If you are using iVectors estimated in batch mode, + a pointer to the iVector, else NULL. + @param [in] online_ivectors + If you are using iVectors estimated 'online' + a pointer to the iVectors, else NULL. + @param [in] online_ivector_period If you are using iVectors estimated 'online' + (i.e. if online_ivectors != NULL) gives the periodicity + (in frames) with which the iVectors are estimated. + */ + DecodableNnetSimpleLooped(const DecodableNnetSimpleLoopedInfo &info, + const MatrixBase &feats, + const VectorBase *ivector = NULL, + const MatrixBase *online_ivectors = NULL, + int32 online_ivector_period = 1); + + + // returns the number of frames of likelihoods. The same as feats_.NumRows() + // in the normal case (but may be less if opts_.frame_subsampling_factor != + // 1). + inline int32 NumFrames() const { return num_subsampled_frames_; } + + inline int32 OutputDim() const { return info_.output_dim_; } + + // Gets the output for a particular frame, with 0 <= frame < NumFrames(). + // 'output' must be correctly sized (with dimension OutputDim()). Note: + // you're expected to call this, and GetOutput(), in an order of increasing + // frames. If you deviate from this, one of these calls may crash. + void GetOutputForFrame(int32 subsampled_frame, + VectorBase *output); + + // Gets the output for a particular frame and pdf_id, with + // 0 <= subsampled_frame < NumFrames(), + // and 0 <= pdf_id < OutputDim(). + inline BaseFloat GetOutput(int32 subsampled_frame, int32 pdf_id) { + KALDI_ASSERT(subsampled_frame >= current_log_post_subsampled_offset_ && + "Frames must be accessed in order."); + while (subsampled_frame >= current_log_post_subsampled_offset_ + + current_log_post_.NumRows()) + AdvanceChunk(); + return current_log_post_(subsampled_frame - + current_log_post_subsampled_offset_, + pdf_id); + } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableNnetSimpleLooped); + + // This function does the computation for the next chunk. + void AdvanceChunk(); + + void AdvanceChunkInternal(const MatrixBase &input_feats, + const VectorBase &ivector); + + // Gets the iVector for the specified frame., if we are + // using iVectors (else does nothing). + void GetCurrentIvector(int32 input_frame, + Vector *ivector); + + // returns dimension of the provided iVectors if supplied, or 0 otherwise. + int32 GetIvectorDim() const; + + const DecodableNnetSimpleLoopedInfo &info_; + + NnetComputer computer_; + + const MatrixBase &feats_; + // note: num_subsampled_frames_ will equal feats_.NumRows() in the normal case + // when opts_.frame_subsampling_factor == 1. + int32 num_subsampled_frames_; + + // ivector_ is the iVector if we're using iVectors that are estimated in batch + // mode. + const VectorBase *ivector_; + + // online_ivector_feats_ is the iVectors if we're using online-estimated ones. + const MatrixBase *online_ivector_feats_; + // online_ivector_period_ helps us interpret online_ivector_feats_; it's the + // number of frames the rows of ivector_feats are separated by. + int32 online_ivector_period_; + + // The current log-posteriors that we got from the last time we + // ran the computation. + Matrix current_log_post_; + + // The number of chunks we have computed so far. 
+ int32 num_chunks_computed_; + + // The time-offset of the current log-posteriors, equals + // (num_chunks_computed_ - 1) * + // (info_.frames_per_chunk_ / info_.opts_.frame_subsampling_factor). + int32 current_log_post_subsampled_offset_; +}; + +class DecodableAmNnetSimpleLooped: public DecodableInterface { + public: + /** + This constructor takes features as input, and you can either supply a + single iVector input, estimated in batch-mode ('ivector'), or 'online' + iVectors ('online_ivectors' and 'online_ivector_period', or none at all. + Note: it stores references to all arguments to the constructor, so don't + delete them till this goes out of scope. + + + @param [in] info This helper class contains all the static pre-computed information + this class needs, and contains a pointer to the neural net. If + you want prior subtraction to be done, you should have initialized + this with the constructor that takes class AmNnetSimple. + @param [in] trans_model The transition model to use. This takes care of the + mapping from transition-id (which is an arg to + LogLikelihood()) to pdf-id (which is used internally). + @param [in] feats A pointer to the input feature matrix; must be non-NULL. + We + @param [in] ivector If you are using iVectors estimated in batch mode, + a pointer to the iVector, else NULL. + @param [in] ivector If you are using iVectors estimated in batch mode, + a pointer to the iVector, else NULL. + @param [in] online_ivectors + If you are using iVectors estimated 'online' + a pointer to the iVectors, else NULL. + @param [in] online_ivector_period If you are using iVectors estimated 'online' + (i.e. if online_ivectors != NULL) gives the periodicity + (in frames) with which the iVectors are estimated. + */ + DecodableAmNnetSimpleLooped(const DecodableNnetSimpleLoopedInfo &info, + const TransitionModel &trans_model, + const MatrixBase &feats, + const VectorBase *ivector = NULL, + const MatrixBase *online_ivectors = NULL, + int32 online_ivector_period = 1); + + + virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id); + + virtual inline int32 NumFramesReady() const { + return decodable_nnet_.NumFrames(); + } + + virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } + + virtual bool IsLastFrame(int32 frame) const { + KALDI_ASSERT(frame < NumFramesReady()); + return (frame == NumFramesReady() - 1); + } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmNnetSimpleLooped); + DecodableNnetSimpleLooped decodable_nnet_; + const TransitionModel &trans_model_; +}; + + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_NNET3_DECODABLE_SIMPLE_LOOPED_H_ diff --git a/src/nnet3/nnet-am-decodable-simple.cc b/src/nnet3/nnet-am-decodable-simple.cc index 9116c9461ac..9d2176965b1 100644 --- a/src/nnet3/nnet-am-decodable-simple.cc +++ b/src/nnet3/nnet-am-decodable-simple.cc @@ -276,32 +276,37 @@ void DecodableNnetSimple::DoNnetComputation( } void DecodableNnetSimple::CheckAndFixConfigs() { - static bool warned_modulus = false, - warned_subsampling = false; + static bool warned_frames_per_chunk = false; int32 nnet_modulus = nnet_.Modulus(); if (opts_.frame_subsampling_factor < 1 || opts_.frames_per_chunk < 1) KALDI_ERR << "--frame-subsampling-factor and --frames-per-chunk must be > 0"; - if (opts_.frames_per_chunk % opts_.frame_subsampling_factor != 0) { - int32 f = opts_.frame_subsampling_factor, - frames_per_chunk = f * ((opts_.frames_per_chunk + f - 1) / f); - if (!warned_subsampling) { - warned_subsampling = true; - KALDI_LOG << 
"Increasing --frames-per-chunk from " - << opts_.frames_per_chunk << " to " - << frames_per_chunk << " to make it a multiple of " - << "--frame-subsampling-factor=" - << opts_.frame_subsampling_factor; + KALDI_ASSERT(nnet_modulus > 0); + int32 n = Lcm(opts_.frame_subsampling_factor, nnet_modulus); + + if (opts_.frames_per_chunk % n != 0) { + // round up to the nearest multiple of n. + int32 frames_per_chunk = n * ((opts_.frames_per_chunk + n - 1) / n); + if (!warned_frames_per_chunk) { + warned_frames_per_chunk = true; + if (nnet_modulus == 1) { + // simpler error message. + KALDI_LOG << "Increasing --frames-per-chunk from " + << opts_.frames_per_chunk << " to " + << frames_per_chunk << " to make it a multiple of " + << "--frame-subsampling-factor=" + << opts_.frame_subsampling_factor; + } else { + KALDI_LOG << "Increasing --frames-per-chunk from " + << opts_.frames_per_chunk << " to " + << frames_per_chunk << " due to " + << "--frame-subsampling-factor=" + << opts_.frame_subsampling_factor << " and " + << "nnet shift-invariance modulus = " << nnet_modulus; + } } opts_.frames_per_chunk = frames_per_chunk; } - if (opts_.frames_per_chunk % nnet_modulus != 0 && !warned_modulus) { - warned_modulus = true; - KALDI_WARN << "It may be more efficient to set the --frames-per-chunk " - << "(currently " << opts_.frames_per_chunk << " to a " - << "multiple of the network's shift-invariance modulus " - << nnet_modulus; - } } diff --git a/src/nnet3/nnet-am-decodable-simple.h b/src/nnet3/nnet-am-decodable-simple.h index 5f7a0307abe..e604765e09a 100644 --- a/src/nnet3/nnet-am-decodable-simple.h +++ b/src/nnet3/nnet-am-decodable-simple.h @@ -33,6 +33,11 @@ namespace kaldi { namespace nnet3 { +// See also the decodable object in decodable-simple-looped.h, which is better +// and faster in most situations, including TDNNs and LSTMs (but not for +// BLSTMs). + + // Note: the 'simple' in the name means it applies to networks // for which IsSimpleNnet(nnet) would return true. struct NnetSimpleComputationOptions { @@ -251,9 +256,11 @@ class DecodableAmNnetSimple: public DecodableInterface { @param [in] opts The options class. Warning: it includes an acoustic weight, whose default is 0.1; you may sometimes want to change this to 1.0. - @param [in] nnet The neural net that we're going to do the computation with - @param [in] priors Vector of priors-- if supplied and nonempty, we subtract - the log of these priors from the nnet output. + @param [in] trans_model The transition model to use. This takes care of the + mapping from transition-id (which is an arg to + LogLikelihood()) to pdf-id (which is used internally). + @param [in] am_nnet The neural net that we're going to do the computation with; + we also get the priors to divide by, if applicable, from here. @param [in] feats A pointer to the input feature matrix; must be non-NULL. We @param [in] ivector If you are using iVectors estimated in batch mode, @@ -329,13 +336,12 @@ class DecodableAmNnetSimpleParallel: public DecodableInterface { @param [in] opts The options class. Warning: it includes an acoustic weight, whose default is 0.1; you may sometimes want to change this to 1.0. - @param [in] nnet The neural net that we're going to do the computation with - @param [in] priors Vector of priors-- if supplied and nonempty, we subtract - the log of these priors from the nnet output. + @param [in] trans_model The transition model to use. This takes care of the + mapping from transition-id (which is an arg to + LogLikelihood()) to pdf-id (which is used internally). 
+ @param [in] am_nnet The neural net that we're going to do the computation with; + it may provide priors to divide by. @param [in] feats A pointer to the input feature matrix; must be non-NULL. - We - @param [in] ivector If you are using iVectors estimated in batch mode, - a pointer to the iVector, else NULL. @param [in] ivector If you are using iVectors estimated in batch mode, a pointer to the iVector, else NULL. @param [in] online_ivectors diff --git a/src/nnet3/nnet-compile-looped.h b/src/nnet3/nnet-compile-looped.h index 00a97292798..f6ff47045fe 100644 --- a/src/nnet3/nnet-compile-looped.h +++ b/src/nnet3/nnet-compile-looped.h @@ -163,13 +163,6 @@ void CreateLoopedComputationRequestSimple(const Nnet &nnet, ComputationRequest *request2, ComputationRequest *request3); -struct NnetSimpleLoopedComputationOptions { - // TODO -}; - -void CreateLoopedComputationSimple( - const Nnet &nnet // ... TODO... - ); diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 857dde1547b..da3a43bd15f 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -168,7 +168,9 @@ struct ComputationRequest { - kAllocMatrixZeroed: Allocate and zero a matrix. arg1 = submatrix index. - kDeallocMatrix: Deallocate a matrix. arg1 = submatrix index. - kAllocMatrixFromOther: initialize matrix with submatrix index arg1 using memory - from matrix with submatrix index arg2 (using shallow swap). + from matrix with submatrix index arg2 (using shallow swap). Note: the + code relating to the 'looped' computation relies on the fact that this is + a swap, so kSwapMatrix might be a better name, but we're keeping the old name. - kAllocMatrixFromOtherZeroed: initialize matrix with submatrix index arg1 using memory from matrix with submatrix index arg2 (using shallow swap), then zero the matrix we just allocated. diff --git a/src/nnet3/nnet-compute-test.cc b/src/nnet3/nnet-compute-test.cc index c485cc06636..81cc67f71ae 100644 --- a/src/nnet3/nnet-compute-test.cc +++ b/src/nnet3/nnet-compute-test.cc @@ -24,6 +24,8 @@ #include "nnet3/nnet-test-utils.h" #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-compute.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "nnet3/decodable-simple-looped.h" namespace kaldi { namespace nnet3 { @@ -71,13 +73,64 @@ void UnitTestComputationRequestIo(ComputationRequest *request) { } } -void TestNnetDecodable(const ComputationRequest &request, - const std::vector > &inputs, - const Nnet &nnet, - const CuMatrixBase &reference_output) { - // DecodableAmNnetSimpleOptions opts; - // This is a placeholder for where we'll eventually test either the decodable - // object or something similar to it (e.g. a base class) +// this checks that a couple of different decodable objects give the same +// answer. +void TestNnetDecodable(Nnet *nnet) { + int32 num_frames = 5 + RandInt(1, 100), + input_dim = nnet->InputDim("input"), + output_dim = nnet->OutputDim("output"), + ivector_dim = std::max(0, nnet->InputDim("ivector")); + Matrix input(num_frames, input_dim); + + + input.SetRandn(); + Vector ivector(ivector_dim); + ivector.SetRandn(); + + Vector priors(RandInt(0, 1) == 0 ? output_dim : 0); + if (priors.Dim() != 0) { + priors.SetRandn(); + priors.ApplyExp(); + } + + Matrix output1(num_frames, output_dim), + output2(num_frames, output_dim); + + { + NnetSimpleComputationOptions opts; + opts.frames_per_chunk = RandInt(5, 25); + CachingOptimizingCompiler compiler(*nnet); + DecodableNnetSimple decodable(opts, *nnet, priors, input, &compiler, + (ivector_dim != 0 ? 
&ivector : NULL)); + for (int32 t = 0; t < num_frames; t++) { + SubVector row(output1, t); + decodable.GetOutputForFrame(t, &row); + } + } + + { + NnetSimpleLoopedComputationOptions opts; + // caution: this may modify nnet, by changing how it consumes iVectors. + DecodableNnetSimpleLoopedInfo info(opts, priors, nnet); + DecodableNnetSimpleLooped decodable(info, input, + (ivector_dim != 0 ? &ivector : NULL)); + for (int32 t = 0; t < num_frames; t++) { + SubVector row(output2, t); + decodable.GetOutputForFrame(t, &row); + } + } + + + if (!NnetIsRecurrent(*nnet) && + nnet->Info().find("statistics-extraction") == std::string::npos) { + // this equivalence will not hold for recurrent nnets or those that + // have the statistics-extraction/statistics-pooling layers. + for (int32 t = 0; t < num_frames; t++) { + SubVector row1(output1, t), + row2(output2, t); + KALDI_ASSERT(row1.ApproxEqual(row2)); + } + } } void UnitTestNnetCompute() { @@ -145,8 +198,6 @@ void UnitTestNnetCompute() { computer.Run(); const CuMatrixBase &output(computer.GetOutput("output")); - TestNnetDecodable(request, inputs, nnet, output); - KALDI_LOG << "Output sum is " << output.Sum(); CuMatrix output_deriv(output.NumRows(), output.NumCols()); output_deriv.SetRandn(); @@ -163,6 +214,7 @@ void UnitTestNnetCompute() { } } } + TestNnetDecodable(&nnet); } } diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 7171e6b0273..75c0c464c90 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -145,8 +145,8 @@ void NnetComputer::DebugAfterExecute(int32 command, } -void NnetComputer::ExecuteCommand(int32 command) { - const NnetComputation::Command &c = computation_.commands[command]; +void NnetComputer::ExecuteCommand() { + const NnetComputation::Command &c = computation_.commands[program_counter_]; int32 m1, m2; try { switch (c.command_type) { @@ -279,7 +279,11 @@ void NnetComputer::ExecuteCommand(int32 command) { dest.AddRowRanges(src, pairs); break; } - case kNoOperation: case kNoOperationMarker: + case kNoOperation: case kNoOperationMarker: case kNoOperationLabel: + break; + case kGotoLabel: + KALDI_ASSERT(computation_.commands[c.arg1].command_type == kNoOperationLabel); + program_counter_ = c.arg1; break; default: KALDI_ERR << "Invalid command in computation"; @@ -290,12 +294,12 @@ void NnetComputer::ExecuteCommand(int32 command) { computation_.GetCommandStrings(nnet_, &preamble, &command_strings_); KALDI_WARN << "Printing some background info since error was detected"; KALDI_LOG << preamble; - for (int32 prev_c = 0; prev_c < command; prev_c++) + for (int32 prev_c = 0; prev_c < program_counter_; prev_c++) KALDI_LOG << command_strings_[prev_c]; } // the following will re-throw the error, but now we've printed more info // about what went wrong. - KALDI_ERR << "Error running command " << command_strings_[command]; + KALDI_ERR << "Error running command " << command_strings_[program_counter_]; } } @@ -381,7 +385,7 @@ void NnetComputer::Run() { } if (debug_) DebugBeforeExecute(program_counter_, &info); - ExecuteCommand(program_counter_); + ExecuteCommand(); if (debug_) { double total_elapsed_now = timer.Elapsed(); DebugAfterExecute(program_counter_, info, diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index 32839755828..0f7da2e01be 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -129,8 +129,8 @@ class NnetComputer { std::vector > matrices_; - // executes the command in computation_.commands[command]. 
- void ExecuteCommand(int32 command); + // executes the command in computation_.commands[program_counter_]. + void ExecuteCommand(); // Returns the matrix index where the input (if is_output==false) or output // matrix index for "node_name" is stored. This looks at the next command (at diff --git a/src/nnet3/nnet-graph.cc b/src/nnet3/nnet-graph.cc index e66a34fc26a..a0216b9189f 100644 --- a/src/nnet3/nnet-graph.cc +++ b/src/nnet3/nnet-graph.cc @@ -39,7 +39,7 @@ void NnetToDirectedGraph(const Nnet &nnet, switch (node.node_type) { case kInput: break; // no node dependencies. - case kDescriptor: + case kDescriptor: node.descriptor.GetNodeDependencies(&node_dependencies); break; case kComponent: @@ -265,7 +265,7 @@ std::string PrintGraphToString(const std::vector > &graph) { void ComputeNnetComputationEpochs(const Nnet &nnet, std::vector *node_to_epoch) { KALDI_ASSERT(node_to_epoch != NULL); - + std::vector > graph; NnetToDirectedGraph(nnet, &graph); KALDI_VLOG(6) << "graph is: " << PrintGraphToString(graph); @@ -276,7 +276,7 @@ void ComputeNnetComputationEpochs(const Nnet &nnet, std::vector > scc_graph; MakeSccGraph(graph, sccs, &scc_graph); KALDI_VLOG(6) << "scc graph is: " << PrintGraphToString(scc_graph); - + std::vector scc_node_to_epoch; ComputeTopSortOrder(scc_graph, &scc_node_to_epoch); if (GetVerboseLevel() >= 6) { @@ -285,7 +285,7 @@ void ComputeNnetComputationEpochs(const Nnet &nnet, os << scc_node_to_epoch[i] << ", "; KALDI_VLOG(6) << "scc_node_to_epoch is: " << os.str(); } - + node_to_epoch->clear(); node_to_epoch->resize(graph.size()); for (int32 i = 0; i < sccs.size(); ++i) { @@ -297,5 +297,21 @@ void ComputeNnetComputationEpochs(const Nnet &nnet, } } +bool GraphHasCycles(const std::vector > &graph) { + std::vector > sccs; + FindSccs(graph, &sccs); + for (size_t i = 0; i < sccs.size(); i++) { + if (sccs[i].size() > 1) + return true; + } + // the next code checks for links from a state to itself. + int32 num_nodes = graph.size(); + for (size_t i = 0; i < num_nodes; i++) + for (std::vector::const_iterator iter = graph[i].begin(), + end = graph[i].end(); iter != end; ++iter) + if (*iter == i) return true; + return false; +} + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-graph.h b/src/nnet3/nnet-graph.h index 24c26176742..27e3fd609f3 100644 --- a/src/nnet3/nnet-graph.h +++ b/src/nnet3/nnet-graph.h @@ -55,10 +55,18 @@ void NnetToDirectedGraph(const Nnet &nnet, /// of destination-nodes of arcs coming from the current node), /// partition it into strongly connected components (i.e. within /// each SCC, all nodes are reachable from all other nodes). +/// Each element of 'sccs' is a list of node indexes that are +/// in that scc. void FindSccs(const std::vector > &graph, std::vector > *sccs); +/// This function returns 'true' if the graph represented in 'graph' +/// contains cycles (including cycles where a single node has an arc +/// to itself). +bool GraphHasCycles(const std::vector > &graph); + + /// Given a list of sccs of a graph (e.g. as computed by FindSccs), compute a /// directed graph on the sccs. Of course this directed graph will be acyclic. void MakeSccGraph(const std::vector > &graph, diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 4f9d3ec078c..4d199d4a0d6 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -147,7 +147,112 @@ void IdentifyIndexesArgs(std::vector *commands, } } +// We declare this class in the .cc file, we don't need to export it. 
+// It's used inside RenumberComputation. +class ComputationRenumberer { + public: + ComputationRenumberer(NnetComputation *computation): + computation_(computation) { } + void Renumber(); + private: + // this function removes unused vectors within the indexes_multi_ array, i.e. + // ones that are not referenced in the computation. + void RemoveUnusedIndexesMulti(); + // this function computes the submatrix_is_used_ vector, saying whether each + // of the original submatrices is referenced somewhere. + void ComputeSubmatrixIsUsed(); + // this function computes the matrix_is_used_ vector (from the + // submatrix_is_used_ vector, from computation_->input_output_info, and from + // computation_->commands, saying whether each of the original matrices is + // referenced somewhere, directly or indirectly. + void ComputeMatrixIsUsed(); + // This function sets up mappings from old to new matrix and submatrix indexes, + // writing to num_{,sub}matrices_new_ and old_to_new_{,sub}matrix_. + void SetUpMappings(); + // This function renumbers submatrix indexes appearing within commands and + // indexes_multi_, and then removes unused submatrices from the list of + // submatrices while leaving the matrix-indexes at their old values (they will + // be mapped by RenumberMatrices()). + void RenumberSubmatrices(); + // renumber matrix indexes appearing within 'commmands', within 'submatrices' + // and 'input_output_info'; renumber 'matrices' and if applicable + // 'debug_info'. + void RenumberMatrices(); + // removes duplicates within the indexes_multi array itself. + void RemoveIndexesMultiDuplicates(); + // removes unused elements and duplicates within 'computation->indexes' + void RenumberIndexes(); + // removes unused elements and duplicates within 'computation->indexes_ranges' + void RenumberIndexesRanges(); + + struct SubMatrixHasher { + SubMatrixHasher() { } + size_t operator () (const NnetComputation::SubMatrixInfo &submat) const { + // these numbers are arbitrarily chosen primes. + return submat.matrix_index + + 19553 * submat.row_offset + + 29297 * submat.num_rows + + 42209 * submat.col_offset + + 56527 * submat.num_cols; + } + }; + + + // Here, T will be int32 or std::pair + template + struct PointerCompare { + // This provides an operator < on two vectors of ints or pairs of ints. It + // is designed to provide a total order on the vectors while accessing as + // small a portion of the vectors' data as possible. It's used in removing + // duplicates from computation_->indexes_multi and computation_->indexes. + // First it compares the length, then it does lexicographical compare. + bool operator ()(const std::vector *ptr1, + const std::vector *ptr2) const { + size_t size1 = ptr1->size(), size2 = ptr2->size(); + if (size1 < size2) return true; + else if (size1 > size2) return false; + else return (*ptr1 < *ptr2); // use the std::vector operator <, which is + // lexicographical comparison. + } + }; + + /// creates a renumbering that removes the elements in "to_remove", + /// e.g. if old_num_elements = 3 and to_remove = [1], would output + /// the vector [ 0, -1, 1 ]. + static void CreateRenumbering(int32 old_num_elements, + const std::vector &to_remove, + std::vector *renumbering); + + /// creates a renumbering from old to new index that removes the unused + /// elements, e.g. if used == [ true, false, true, true], would output the + /// vector [ 0, -1, 1, 2 ]. Returns number of new elements, i.e. the + /// number of elements of 'used' that were true. 
+ static int32 CreateRenumbering(const std::vector &used, + std::vector *renumbering); + + // vector of bool indexed by original submatrix-index, that is true if a + // submatrix-index is used somewhere in the computation (always true for + // the zeroth element). + std::vector submatrix_is_used_; + // vector of bool indexed by original submatrix-index, that is true if a + // submatrix-index will be kept; this is like submatrix_is_used_; but for + // duplicate submatrices, all but the first duplicate will be marked false). + std::vector submatrix_is_kept_; + // vector of bool indexed by original-matrix-index > 0, that is true if a + // matrix-index is used somewhere in the computation, directly or indirectly. + // always true for the zeroth element. + std::vector matrix_is_used_; + NnetComputation *computation_; + int32 num_matrices_new_; + int32 num_submatrices_new_; + std::vector old_to_new_matrix_; // numbered by orig-matrix-index, gives + // new-matrix-index. -1 for removed + // ones. + std::vector old_to_new_submatrix_; // numbered by orig-submatrix-index, + // gives new-submatrix-index. -1 + // for removed ones. +}; // static int32 ComputationRenumberer::CreateRenumbering( @@ -547,6 +652,7 @@ void RenumberComputation(NnetComputation *computation) { renumberer.Renumber(); } + void RemoveNoOps(NnetComputation *computation) { std::vector::iterator input_iter = computation->commands.begin(), @@ -844,6 +950,77 @@ std::pair VariableMergingOptimizer::MayBeMerged( return std::pair(false,false); } + +/** This class is responsible for consolidating the model-update part of + backprop commands, for components in (e.g.) recurrent networks that need to + have many separate backprop commands, into more efficient single commands + operating on consolidated data in larger matrices. This is useful for + recurrent networks. */ +class ModelUpdateConsolidator { + public: + ModelUpdateConsolidator(const Nnet &nnet, + NnetComputation *computation); + void ConsolidateModelUpdate(); + private: + void ConsolidateUpdateForComponent( + int32 component, + const std::vector &backprop_commands); + + /// This function, called at the end of ConsolidateModelUpdate(), takes the + /// commands that we have put in extra_commands_, final_commands_ and + /// final_deallocate_commands_, and puts them in the appropriate place in + /// computation->commands_. + void AddCommandsToComputation(); + + /// You call this function when you want to consolidate the values of a list + /// of submatrices taken just prior to particular commands. The input + /// 'commands' and 'submatrices' lists must be the same size, and size must be + /// > 1. This function will create a new matrix that is the row-wise + /// concatentation of all these submatrices, with values taken just prior to + /// the respective command indexes. This function will will add to + /// extra_commands_ the commands to do the copying at the appropriate places + /// (at the supplied command indexes; they will be inserted just before). The + /// return value is the submatrix index of a submatrix that represents the + /// whole of the consolidated matrix. This command will insert, at the + /// beginning of the computation (in extra_commands_[0]), a command to + /// initialize the matrix; and will append to final_deallocate_commands_ the + /// commands to deallocate the matrix. 
If computation_->matrix_debug_info is + /// nonempty, this function will also update computation_->matrix_debug_info + /// with suitable values for the newly added matrix + int32 ConsolidateSubmatrices( + const std::vector &commands, + const std::vector &submatrices); + + /// This function, called from ConsolidateSubmatrices, will + /// update 'debug_info' by appending the corresponding 'indexes' from + /// the existing debug info for this submatrix. It will also set + /// the 'is_deriv' of '*debug_info' to the same value as the + /// debug info for 'submatrix_index', and set the 'node_index' to the + /// 'node_index' in the debug info for that submatrix-index. + /// It requires that computation_->matrix_debug_info be nonempty. + void AppendDebugInfoForSubmatrix( + int32 submatrix_index, + NnetComputation::MatrixDebugInfo *debug_info) const; + + const Nnet &nnet_; + NnetComputation *computation_; + + // Indexed by the original command index in *computation_ (and sized to the + // original number of commands in *computation_ before we added anything), + // extra_commands_[c] contains a list of commands that need to be inserted + // just before command c in the previously existing computation. + std::vector > extra_commands_; + + // This is as list of kBackprop commands that will be placed after the + // commands in 'computation_->commands' and 'extra_commands_', but before + // the 'final_deallocate_commands_'. + std::vector final_commands_; + // This is a list of commands to deallocate our 'consolidated' matrices; the + // commands will be placed after the commands in 'final_commands_'. + std::vector final_deallocate_commands_; +}; + + void ModelUpdateConsolidator::AppendDebugInfoForSubmatrix( int32 submatrix_index, NnetComputation::MatrixDebugInfo *debug_info) const { @@ -867,7 +1044,6 @@ void ModelUpdateConsolidator::AppendDebugInfoForSubmatrix( src_info.cindexes.begin() + row_end); } - // see comment by declaration in header. int32 ModelUpdateConsolidator::ConsolidateSubmatrices( const std::vector &commands, @@ -1041,6 +1217,19 @@ void ModelUpdateConsolidator::ConsolidateModelUpdate() { AddCommandsToComputation(); } + +void ConsolidateModelUpdate(const Nnet &nnet, + NnetComputation *computation) { + // This following if-statement is an optimization: if the computation + // request(s) had need_model_derivative == false, there would be nothing to + // optimize, so don't bother trying. + if (!computation->need_model_derivative) + return; + ModelUpdateConsolidator consolidator(nnet, computation); + consolidator.ConsolidateModelUpdate(); +} + + // inline void DerivativeTimeLimiter::GetPruneValues(int32 initial_submatrix, int32 new_submatrix, @@ -2315,18 +2504,29 @@ void FixGotoLabel(NnetComputation *computation) { int32 num_commands = computation->commands.size(); if (num_commands == 0) return; - if (computation->commands[num_commands-1].command_type == kGotoLabel) { - int32 dest_command = computation->commands[num_commands-1].arg1; - if (static_cast(dest_command) < computation->commands.size() && - computation->commands[dest_command].command_type == kNoOperationLabel) - return; // nothing to fix. 
- for (int32 c = 0; c + 1 < num_commands; c++) { - if (computation->commands[c].command_type == kNoOperationLabel) { - computation->commands[num_commands-1].arg1 = c; - return; + for (int32 c = num_commands - 1; c >= 0; c--) { + if (computation->commands[c].command_type == kGotoLabel) { + int32 dest_command = computation->commands[c].arg1; + if (static_cast(dest_command) < computation->commands.size() && + computation->commands[dest_command].command_type == kNoOperationLabel) + return; // nothing to fix. + for (int32 d = 0; d + 1 < num_commands; d++) { + if (computation->commands[d].command_type == kNoOperationLabel) { + computation->commands[c].arg1 = d; + return; + } } + KALDI_ERR << "Label not found."; + } else if (computation->commands[c].command_type == kProvideOutput) { + // sometimes kProvideOutput commands are temporarily ordered after + // the kGotoLabel command, and we need to work in that case. + continue; + } else { + // it loks like there is no 'goto' command in this computation- + // if there were, it would be right at the end, possibly followed by + // kProvideOutput commands. + break; } - KALDI_ERR << "Label not found."; } } diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 29f05add695..e289ff9126c 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -182,182 +182,20 @@ class VariableMergingOptimizer { }; -/** This class is responsible for consolidating the model-update part of - backprop commands, for components in (e.g.) recurrent networks that need to - have many separate backprop commands, into more efficient single commands - operating on consolidated data in larger matrices. This is useful for - recurrent networks. */ -class ModelUpdateConsolidator { - public: - ModelUpdateConsolidator(const Nnet &nnet, - NnetComputation *computation); - void ConsolidateModelUpdate(); - private: - void ConsolidateUpdateForComponent( - int32 component, - const std::vector &backprop_commands); - - /// This function, called at the end of ConsolidateModelUpdate(), takes the - /// commands that we have put in extra_commands_, final_commands_ and - /// final_deallocate_commands_, and puts them in the appropriate place in - /// computation->commands_. - void AddCommandsToComputation(); - - /// You call this function when you want to consolidate the values of a list - /// of submatrices taken just prior to particular commands. The input - /// 'commands' and 'submatrices' lists must be the same size, and size must be - /// > 1. This function will create a new matrix that is the row-wise - /// concatentation of all these submatrices, with values taken just prior to - /// the respective command indexes. This function will will add to - /// extra_commands_ the commands to do the copying at the appropriate places - /// (at the supplied command indexes; they will be inserted just before). The - /// return value is the submatrix index of a submatrix that represents the - /// whole of the consolidated matrix. This command will insert, at the - /// beginning of the computation (in extra_commands_[0]), a command to - /// initialize the matrix; and will append to final_deallocate_commands_ the - /// commands to deallocate the matrix. 
If computation_->matrix_debug_info is - /// nonempty, this function will also update computation_->matrix_debug_info - /// with suitable values for the newly added matrix - int32 ConsolidateSubmatrices( - const std::vector &commands, - const std::vector &submatrices); - - /// This function, called from ConsolidateSubmatrices, will - /// update 'debug_info' by appending the corresponding 'indexes' from - /// the existing debug info for this submatrix. It will also set - /// the 'is_deriv' of '*debug_info' to the same value as the - /// debug info for 'submatrix_index', and set the 'node_index' to the - /// 'node_index' in the debug info for that submatrix-index. - /// It requires that computation_->matrix_debug_info be nonempty. - void AppendDebugInfoForSubmatrix( - int32 submatrix_index, - NnetComputation::MatrixDebugInfo *debug_info) const; - - const Nnet &nnet_; - NnetComputation *computation_; - - // Indexed by the original command index in *computation_ (and sized to the - // original number of commands in *computation_ before we added anything), - // extra_commands_[c] contains a list of commands that need to be inserted - // just before command c in the previously existing computation. - std::vector > extra_commands_; - - // This is as list of kBackprop commands that will be placed after the - // commands in 'computation_->commands' and 'extra_commands_', but before - // the 'final_deallocate_commands_'. - std::vector final_commands_; - // This is a list of commands to deallocate our 'consolidated' matrices; the - // commands will be placed after the commands in 'final_commands_'. - std::vector final_deallocate_commands_; -}; - - -// We declare this class in the .cc file, we don't need to export it. -// It's used inside RenumberComputation. -class ComputationRenumberer { - public: - ComputationRenumberer(NnetComputation *computation): - computation_(computation) { } - - void Renumber(); - private: - // this function removes unused vectors within the indexes_multi_ array, i.e. - // ones that are not referenced in the computation. - void RemoveUnusedIndexesMulti(); - // this function computes the submatrix_is_used_ vector, saying whether each - // of the original submatrices is referenced somewhere. - void ComputeSubmatrixIsUsed(); - // this function computes the matrix_is_used_ vector (from the - // submatrix_is_used_ vector, from computation_->input_output_info, and from - // computation_->commands, saying whether each of the original matrices is - // referenced somewhere, directly or indirectly. - void ComputeMatrixIsUsed(); - // This function sets up mappings from old to new matrix and submatrix indexes, - // writing to num_{,sub}matrices_new_ and old_to_new_{,sub}matrix_. - void SetUpMappings(); - // This function renumbers submatrix indexes appearing within commands and - // indexes_multi_, and then removes unused submatrices from the list of - // submatrices while leaving the matrix-indexes at their old values (they will - // be mapped by RenumberMatrices()). - void RenumberSubmatrices(); - // renumber matrix indexes appearing within 'commmands', within 'submatrices' - // and 'input_output_info'; renumber 'matrices' and if applicable - // 'debug_info'. - void RenumberMatrices(); - // removes duplicates within the indexes_multi array itself. 
- void RemoveIndexesMultiDuplicates(); - // removes unused elements and duplicates within 'computation->indexes' - void RenumberIndexes(); - // removes unused elements and duplicates within 'computation->indexes_ranges' - void RenumberIndexesRanges(); - - struct SubMatrixHasher { - SubMatrixHasher() { } - size_t operator () (const NnetComputation::SubMatrixInfo &submat) const { - // these numbers are arbitrarily chosen primes. - return submat.matrix_index + - 19553 * submat.row_offset + - 29297 * submat.num_rows + - 42209 * submat.col_offset + - 56527 * submat.num_cols; - } - }; +/** + This optimization consolidates + the model-update part of + backprop commands, for components in (e.g.) recurrent networks that need to + have many separate backprop commands, into more efficient single commands + operating on consolidated data in larger matrices. This is useful for + recurrent networks. The resulting computation separates the backprop for + data-derivatives from the model-update part of backprop. + */ +void ConsolidateModelUpdate(const Nnet &nnet, + NnetComputation *computation); - // Here, T will be int32 or std::pair - template - struct PointerCompare { - // This provides an operator < on two vectors of ints or pairs of ints. It - // is designed to provide a total order on the vectors while accessing as - // small a portion of the vectors' data as possible. It's used in removing - // duplicates from computation_->indexes_multi and computation_->indexes. - // First it compares the length, then it does lexicographical compare. - bool operator ()(const std::vector *ptr1, - const std::vector *ptr2) const { - size_t size1 = ptr1->size(), size2 = ptr2->size(); - if (size1 < size2) return true; - else if (size1 > size2) return false; - else return (*ptr1 < *ptr2); // use the std::vector operator <, which is - // lexicographical comparison. - } - }; - /// creates a renumbering that removes the elements in "to_remove", - /// e.g. if old_num_elements = 3 and to_remove = [1], would output - /// the vector [ 0, -1, 1 ]. - static void CreateRenumbering(int32 old_num_elements, - const std::vector &to_remove, - std::vector *renumbering); - - /// creates a renumbering from old to new index that removes the unused - /// elements, e.g. if used == [ true, false, true, true], would output the - /// vector [ 0, -1, 1, 2 ]. Returns number of new elements, i.e. the - /// number of elements of 'used' that were true. - static int32 CreateRenumbering(const std::vector &used, - std::vector *renumbering); - - // vector of bool indexed by original submatrix-index, that is true if a - // submatrix-index is used somewhere in the computation (always true for - // the zeroth element). - std::vector submatrix_is_used_; - // vector of bool indexed by original submatrix-index, that is true if a - // submatrix-index will be kept; this is like submatrix_is_used_; but for - // duplicate submatrices, all but the first duplicate will be marked false). - std::vector submatrix_is_kept_; - // vector of bool indexed by original-matrix-index > 0, that is true if a - // matrix-index is used somewhere in the computation, directly or indirectly. - // always true for the zeroth element. - std::vector matrix_is_used_; - NnetComputation *computation_; - int32 num_matrices_new_; - int32 num_submatrices_new_; - std::vector old_to_new_matrix_; // numbered by orig-matrix-index, gives - // new-matrix-index. -1 for removed - // ones. - std::vector old_to_new_submatrix_; // numbered by orig-submatrix-index, - // gives new-submatrix-index. 
-1 - // for removed ones. -}; // Class DerivativeTimeLimiter is used inside LimitDerivativeTimes(). diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index b4b38b5f736..6e4242ace09 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -352,19 +352,6 @@ void VariableMergingOptimization(const NnetOptimizeOptions &config, } } -// This is a simplified top-level interface to the model-update consolidation -// code from class ModelUpdateConsolidator. -void ConsolidateModelUpdate(const Nnet &nnet, - NnetComputation *computation) { - // This following if-statement is an optimization: if the computation - // request(s) had need_model_derivative == false, there would be nothing to - // optimize, so don't bother trying. - if (!computation->need_model_derivative) - return; - ModelUpdateConsolidator consolidator(nnet, computation); - consolidator.ConsolidateModelUpdate(); -} - void ConvertAdditionToAssignment(const Nnet &nnet, NnetComputation *computation) { @@ -689,6 +676,91 @@ static void SplitComputationIntoSegments( segments->push_back(std::pair(cur_start, num_commands)); } +// This is a helper function used in ConsolidateIoOperations(). +// +// Suppose we had something like this before ConsolidateIoOperations() (as would +// be printed by Print() + +// c90: output m50 to user [for node: 'output'] +// ... +// c100: [label for goto statement] +// c101: # computation segment separator [e.g., begin backward commands] +// ... +// c105: m62 = user input [for node: 'input'] +// ... +// c190: output m79 to user [for node: 'output'] +// ... +// c200: goto c100 +// +// this would get reordered to the following by ConsolidateIoOperations +// (the bulk of the code, before this function is called): +// +// c99: [label for goto statement] +// c100: output m50 to user [for node: 'output'] +// c101: # computation segment separator [e.g., begin backward commands] +// c102: m62 = user input [for node: 'input'] +// ... +// c199: goto c199 +// c200: output m79 to user [for node: 'output'] +// +// Now command c200 is unreachable, but there is a similar command at c100 +// (after the goto) that will substitute. However, the matrix indexes are different. +// So we need to change the above so that the last two commands read: +// c199: m50.swap(m79} +// c200: goto c199 +void FixGotoOutputReordering(const Nnet &nnet, + NnetComputation *computation) { + FixGotoLabel(computation); // make sure the destination label of the goto statement was + // correct. + int32 goto_command_index = -1; + for (int32 c = computation->commands.size(); c >= 0; c--) + if (computation->commands[c].command_type == kGotoLabel) + goto_command_index = c; + KALDI_ASSERT(goto_command_index > 0); + int32 goto_label_index = computation->commands[goto_command_index].arg1; + + std::vector output_commands_after_goto, + output_commands_after_label; + for (int32 c = goto_command_index + 1; + c < static_cast(computation->commands.size()); c++) { + KALDI_ASSERT(computation->commands[c].command_type == kProvideOutput); + output_commands_after_goto.push_back(c); + } + for (int32 c = goto_label_index + 1; + c < goto_command_index; c++) { // note: we break from this loop. 
+    CommandType t = computation->commands[c].command_type;
+    if (t == kProvideOutput)
+      output_commands_after_label.push_back(c);
+    else if (t != kNoOperationMarker && t != kAcceptInput)
+      break;
+  }
+  if (output_commands_after_goto.size() != output_commands_after_label.size()) {
+    computation->Print(std::cerr, nnet);
+    KALDI_ERR << "Could not fix goto/output reordering, size mismatch.";
+  }
+  NnetComputation::Command goto_command = computation->commands[goto_command_index];
+  // but we'll be replacing the final kProvideOutput commands with
+  // kAllocMatrixFromOther [i.e. swap commands], and moving them one command
+  // backward; later we'll put the goto command at the end.
+  for (size_t i = 0; i < output_commands_after_goto.size(); i++) {
+    int32 c1 = output_commands_after_label[i],
+        c2 = output_commands_after_goto[i],
+        new_c2 = c2 - 1;
+    int32 s1 = computation->commands[c1].arg1,
+        s2 = computation->commands[c2].arg1;
+    // The following assert checks that the network node-index is the same...
+    // the idea is that the outputs should have been provided in the same order.
+    // I can think of no reason why the order might be different.
+    KALDI_ASSERT(computation->commands[c1].arg2 ==
+                 computation->commands[c2].arg2);
+    computation->commands[new_c2].command_type = kAllocMatrixFromOther;
+    computation->commands[new_c2].arg1 = s1;
+    computation->commands[new_c2].arg2 = s2;
+  }
+  // ... and move the goto command to the end.
+  computation->commands.back() = goto_command;
+}
+
 
 void ConsolidateIoOperations(const Nnet &nnet,
                              NnetComputation *computation) {
@@ -745,23 +817,8 @@ void ConsolidateIoOperations(const Nnet &nnet,
   }
   computation->commands.swap(reordered_commands);
 
-  if (ends_with_goto) {
-    // If, before this operation, the last command was kGotoLael, remove all
-    // commands that have been reordered to go after the kGotoLabel command
-    // [they would be unreachable anyway.]  This relates to looped computations.
-    // It may seem wrong that we are just removing these
-    // kAcceptInput/kProvideOutput commands, but the reason it's OK
-    // (and preserves equivalence with the code prior to this function call),
-    // is that the corresponding commands have also been moved past the
-    // kNoOperationLabel command that the goto jumps to, so those commands
-    // will actually get run.
-    // We don't actually check this here (it would lead to a crash when
-    // the computation was executed, if something is wrong in this logic).
-    while (!computation->commands.empty() &&
-           computation->commands.back().command_type != kGotoLabel)
-      computation->commands.pop_back();
-    FixGotoLabel(computation);
-  }
+  if (ends_with_goto)
+    FixGotoOutputReordering(nnet, computation);
 }
 
 
diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h
index 27871552017..c01563f11cb 100644
--- a/src/nnet3/nnet-optimize.h
+++ b/src/nnet3/nnet-optimize.h
@@ -266,6 +266,11 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet,
 /// This optimization puts the input operations (kAcceptInput) and output
 /// operations (kProvideOutput) at the very beginning or end of segments of
 /// computation, respectively.
+///
+/// This is actually necessary for computations to be run easily, because if these
+/// commands were interspersed with the regular commands, you'd have to
+/// call computer.Run() between the individual AcceptInput() and GetOutput()
+/// function calls.
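+///
+/// As a rough sketch of the calling pattern this enables (illustrative only: it
+/// assumes a computation with a single input node "input" and a single output
+/// node "output", and omits the options / computation / feature setup):
+///
+///   NnetComputer computer(compute_opts, computation, nnet, &nnet_to_update);
+///   computer.AcceptInput("input", &input_feats);   // provide all inputs first,
+///   computer.Run();                                // then run the whole segment,
+///   const CuMatrixBase<BaseFloat> &out = computer.GetOutput("output");  // then read outputs.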
void ConsolidateIoOperations(const Nnet &nnet, NnetComputation *computation); diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index eca7c6b2075..205fc031323 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1035,6 +1035,8 @@ void GenerateConfigSequence( GenerateConfigSequenceCompositeBlock(opts, configs); break; case 10: + if (!opts.allow_statistics_pooling) + goto start; GenerateConfigSequenceStatistics(opts, configs); break; case 11: diff --git a/src/nnet3/nnet-test-utils.h b/src/nnet3/nnet-test-utils.h index b6976f70ab1..a9616281bdc 100644 --- a/src/nnet3/nnet-test-utils.h +++ b/src/nnet3/nnet-test-utils.h @@ -40,6 +40,7 @@ struct NnetGenerationOptions { bool allow_final_nonlinearity; bool allow_use_of_x_dim; bool allow_ivector; + bool allow_statistics_pooling; // if set to a value >0, the output-dim of the network // will be set to this value. int32 output_dim; @@ -54,6 +55,7 @@ struct NnetGenerationOptions { allow_final_nonlinearity(true), allow_use_of_x_dim(true), allow_ivector(false), + allow_statistics_pooling(true), output_dim(-1) { } }; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index ed20257c7fe..dbe676de1ef 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -707,6 +707,14 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { } +/// Returns true if 'nnet' has some kind of recurrency. +bool NnetIsRecurrent(const Nnet &nnet) { + std::vector > graph; + NnetToDirectedGraph(nnet, &graph); + return GraphHasCycles(graph); +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 55fcddd7f58..8acdbfd9b96 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -129,6 +129,9 @@ void AddNnet(const Nnet &src, BaseFloat alpha, Nnet *dest); void AddNnetComponents(const Nnet &src, const Vector &alphas, BaseFloat scale, Nnet *dest); +/// Returns true if 'nnet' has some kind of recurrency. +bool NnetIsRecurrent(const Nnet &nnet); + /// Returns the total of the number of parameters in the updatable components of /// the nnet. 
int32 NumParameters(const Nnet &src); From ee9b9633f3e19464fe648063eeaaf2bd42459bc3 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 13 Nov 2016 19:43:49 -0500 Subject: [PATCH 018/213] Add decoding program nnet3-latgen-faster-looped --- src/nnet3bin/Makefile | 2 +- src/nnet3bin/nnet3-latgen-faster-looped.cc | 266 +++++++++++++++++++++ 2 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 src/nnet3bin/nnet3-latgen-faster-looped.cc diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index d46c56a1044..fd576404f1d 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -17,7 +17,7 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-discriminative-merge-egs nnet3-discriminative-shuffle-egs \ nnet3-discriminative-compute-objf nnet3-discriminative-train \ discriminative-get-supervision nnet3-discriminative-subset-egs \ - nnet3-discriminative-compute-from-egs + nnet3-discriminative-compute-from-egs nnet3-latgen-faster-looped OBJFILES = diff --git a/src/nnet3bin/nnet3-latgen-faster-looped.cc b/src/nnet3bin/nnet3-latgen-faster-looped.cc new file mode 100644 index 00000000000..ee6867ff352 --- /dev/null +++ b/src/nnet3bin/nnet3-latgen-faster-looped.cc @@ -0,0 +1,266 @@ +// nnet3bin/nnet3-latgen-faster-looped.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) +// 2014 Guoguo Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "tree/context-dep.h" +#include "hmm/transition-model.h" +#include "fstext/fstext-lib.h" +#include "decoder/decoder-wrappers.h" +#include "nnet3/decodable-simple-looped.h" +#include "base/timer.h" + + +int main(int argc, char *argv[]) { + // note: making this program work with GPUs is as simple as initializing the + // device, but it probably won't make a huge difference in speed for typical + // setups. 
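+  //
+  // A sketch of what that would look like, following the pattern used by other
+  // nnet3 binaries (a --use-gpu string option would also have to be registered
+  // with 'po'; this is illustrative and not part of this program):
+  //
+  //   #if HAVE_CUDA == 1
+  //     CuDevice::Instantiate().SelectGpuId(use_gpu);
+  //   #endif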
+ try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Generate lattices using nnet3 neural net model.\n" + "[this version uses the 'looped' computation, which may be slightly faster for\n" + "many architectures, but should not be used for backwards-recurrent architectures\n" + "such as BLSTMs.\n" + "Usage: nnet3-latgen-faster-looped [options] " + " [ [] ]\n"; + ParseOptions po(usage); + Timer timer; + bool allow_partial = false; + LatticeFasterDecoderConfig config; + NnetSimpleLoopedComputationOptions decodable_opts; + + std::string word_syms_filename; + std::string ivector_rspecifier, + online_ivector_rspecifier, + utt2spk_rspecifier; + int32 online_ivector_period = 0; + config.Register(&po); + decodable_opts.Register(&po); + po.Register("word-symbol-table", &word_syms_filename, + "Symbol table for words [for debug output]"); + po.Register("allow-partial", &allow_partial, + "If true, produce output even if end state was not reached."); + po.Register("ivectors", &ivector_rspecifier, "Rspecifier for " + "iVectors as vectors (i.e. not estimated online); per utterance " + "by default, or per speaker if you provide the --utt2spk option."); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier for " + "iVectors estimated online, as matrices. If you supply this," + " you must set the --online-ivector-period option."); + po.Register("online-ivector-period", &online_ivector_period, "Number of frames " + "between iVectors in matrices supplied to the --online-ivectors " + "option"); + + po.Read(argc, argv); + + if (po.NumArgs() < 4 || po.NumArgs() > 6) { + po.PrintUsage(); + exit(1); + } + + std::string model_in_filename = po.GetArg(1), + fst_in_str = po.GetArg(2), + feature_rspecifier = po.GetArg(3), + lattice_wspecifier = po.GetArg(4), + words_wspecifier = po.GetOptArg(5), + alignment_wspecifier = po.GetOptArg(6); + + TransitionModel trans_model; + AmNnetSimple am_nnet; + { + bool binary; + Input ki(model_in_filename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + } + + bool determinize = config.determinize_lattice; + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReaderMapped ivector_reader( + ivector_rspecifier, utt2spk_rspecifier); + + Int32VectorWriter words_writer(words_wspecifier); + Int32VectorWriter alignment_writer(alignment_wspecifier); + + fst::SymbolTable *word_syms = NULL; + if (word_syms_filename != "") + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + int num_success = 0, num_fail = 0; + + // this object contains precomputed stuff that is used by all decodable + // objects. It takes a pointer to am_nnet because if it has iVectors it has + // to modify the nnet to accept iVectors at intervals. 
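+    // For that reason it is constructed just once here, outside the per-utterance
+    // loops, and each utterance below only builds a lightweight decodable object
+    // on top of it, i.e. (this is the call that appears in the loops further down):
+    //
+    //   DecodableAmNnetSimpleLooped nnet_decodable(
+    //       decodable_info, trans_model, features, ivector, online_ivectors,
+    //       online_ivector_period);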
+ DecodableNnetSimpleLoopedInfo decodable_info(decodable_opts, + &am_nnet); + + + if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + // Input FST is just one FST, not a table of FSTs. + VectorFst *decode_fst = fst::ReadFstKaldi(fst_in_str); + timer.Reset(); + + { + LatticeFasterDecoder decoder(*decode_fst, config); + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features (feature_reader.Value()); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; + num_fail++; + continue; + } else { + ivector = &ivector_reader.Value(utt); + } + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; + num_fail++; + continue; + } else { + online_ivectors = &online_ivector_reader.Value(utt); + } + } + + + DecodableAmNnetSimpleLooped nnet_decodable( + decodable_info, trans_model, features, ivector, online_ivectors, + online_ivector_period); + + double like; + if (DecodeUtteranceLatticeFaster( + decoder, nnet_decodable, trans_model, word_syms, utt, + decodable_opts.acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, &compact_lattice_writer, + &lattice_writer, + &like)) { + tot_like += like; + frame_count += features.NumRows(); + num_success++; + } else num_fail++; + } + } + delete decode_fst; // delete this only after decoder goes out of scope. + } else { // We have different FSTs for different utterances. 
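+      // In this branch 'fst_in_str' was classified as an rspecifier, i.e. a table
+      // of per-utterance decoding graphs (for example "ark:HCLG.fsts" -- the name
+      // is purely illustrative), so a separate LatticeFasterDecoder is constructed
+      // from each utterance's FST below, instead of one decoder being shared by
+      // all utterances as in the branch above.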
+ SequentialTableReader fst_reader(fst_in_str); + RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); + for (; !fst_reader.Done(); fst_reader.Next()) { + std::string utt = fst_reader.Key(); + if (!feature_reader.HasKey(utt)) { + KALDI_WARN << "Not decoding utterance " << utt + << " because no features available."; + num_fail++; + continue; + } + const Matrix &features = feature_reader.Value(utt); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + + LatticeFasterDecoder decoder(fst_reader.Value(), config); + + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; + num_fail++; + continue; + } else { + ivector = &ivector_reader.Value(utt); + } + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; + num_fail++; + continue; + } else { + online_ivectors = &online_ivector_reader.Value(utt); + } + } + + DecodableAmNnetSimpleLooped nnet_decodable( + decodable_info, trans_model, features, ivector, online_ivectors, + online_ivector_period); + + double like; + if (DecodeUtteranceLatticeFaster( + decoder, nnet_decodable, trans_model, word_syms, utt, + decodable_opts.acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, &compact_lattice_writer, + &lattice_writer, &like)) { + tot_like += like; + frame_count += features.NumRows(); + num_success++; + } else num_fail++; + } + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed*100.0/frame_count); + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " + << frame_count<<" frames."; + + delete word_syms; + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} From 7327c038554bcfdde5c72f2c5e7ac94127483afa Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 13 Nov 2016 21:33:18 -0500 Subject: [PATCH 019/213] Fix bug discovered by testing code --- src/nnet3/decodable-simple-looped.cc | 9 ++++++--- src/nnet3bin/nnet3-latgen-faster-looped.cc | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index 9e580dc121f..0df4c3b6c31 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -85,8 +85,10 @@ void DecodableNnetSimpleLoopedInfo::Init( CompileLooped(*nnet, opts_.optimize_config, request1, request2, request3, &computation_); computation_.ComputeCudaIndexes(); - KALDI_LOG << "Computation is:"; - computation_.Print(std::cerr, *nnet); + if (GetVerboseLevel() >= 3) { + KALDI_VLOG(3) << "Computation is:"; + computation_.Print(std::cerr, *nnet); + } } @@ -141,7 +143,8 @@ void DecodableNnetSimpleLooped::AdvanceChunk() { // note: end is last plus one. 
end_input_frame = info_.frames_per_chunk_ + info_.frames_right_context_; } else { - begin_input_frame = num_chunks_computed_ * info_.frames_per_chunk_; + begin_input_frame = num_chunks_computed_ * info_.frames_per_chunk_ + + info_.frames_right_context_; end_input_frame = begin_input_frame + info_.frames_per_chunk_; } CuMatrix feats_chunk(end_input_frame - begin_input_frame, diff --git a/src/nnet3bin/nnet3-latgen-faster-looped.cc b/src/nnet3bin/nnet3-latgen-faster-looped.cc index ee6867ff352..9ad20fd8764 100644 --- a/src/nnet3bin/nnet3-latgen-faster-looped.cc +++ b/src/nnet3bin/nnet3-latgen-faster-looped.cc @@ -171,9 +171,9 @@ int main(int argc, char *argv[]) { } - DecodableAmNnetSimpleLooped nnet_decodable( - decodable_info, trans_model, features, ivector, online_ivectors, - online_ivector_period); + DecodableAmNnetSimpleLooped nnet_decodable( + decodable_info, trans_model, features, ivector, online_ivectors, + online_ivector_period); double like; if (DecodeUtteranceLatticeFaster( From f37d42268eba1473ad63fba47cc5972ae2bc132b Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Wed, 30 Nov 2016 22:08:51 -0500 Subject: [PATCH 020/213] Fix bug discovered by TDNN decoding script --- src/nnet3/decodable-simple-looped.cc | 25 +++++++++++++++++++------ src/nnet3/decodable-simple-looped.h | 5 +++++ src/nnet3/nnet-compile-looped.cc | 14 ++++++++------ 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index 0df4c3b6c31..bb9a38632a1 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -72,7 +72,6 @@ void DecodableNnetSimpleLoopedInfo::Init( if (has_ivectors_) ModifyNnetIvectorPeriod(ivector_period, nnet); - ComputationRequest request1, request2, request3; int32 num_sequences = 1; // we're processing one utterance at a time. int32 extra_right_context = 0; CreateLoopedComputationRequestSimple(*nnet, frames_per_chunk_, @@ -80,9 +79,9 @@ void DecodableNnetSimpleLoopedInfo::Init( ivector_period, opts.extra_left_context_initial, extra_right_context, num_sequences, - &request1, &request2, &request3); + &request1_, &request2_, &request3_); - CompileLooped(*nnet, opts_.optimize_config, request1, request2, request3, + CompileLooped(*nnet, opts_.optimize_config, request1_, request2_, request3_, &computation_); computation_.ComputeCudaIndexes(); if (GetVerboseLevel() >= 3) { @@ -172,11 +171,25 @@ void DecodableNnetSimpleLooped::AdvanceChunk() { computer_.AcceptInput("input", &feats_chunk); if (info_.has_ivectors_) { + KALDI_ASSERT(info_.request1_.inputs.size() == 2); + // all but the 1st chunk should have 1 iVector, but no need + // to assume this. + int32 num_ivectors = (num_chunks_computed_ == 0 ? + info_.request1_.inputs[1].indexes.size() : + info_.request2_.inputs[1].indexes.size()); + KALDI_ASSERT(num_ivectors > 0); + Vector ivector; + // we just get the iVector from the last input frame we needed... + // we don't bother trying to be 'accurate' in getting the iVectors + // for their 'correct' frames, because in general using the + // iVector from as large 't' as possible will be better. 
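+      // As a concrete but purely illustrative example: with frames_per_chunk = 20
+      // and an iVector period of 10, the first chunk's request might ask for
+      // iVectors at t = 0 and t = 10, giving num_ivectors = 2; every one of those
+      // rows is filled below with the same iVector, which is fine because they
+      // would be essentially identical in any case.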
GetCurrentIvector(end_input_frame, &ivector); - CuMatrix cu_ivector(1, ivector.Dim()); - cu_ivector.Row(0).CopyFromVec(ivector); - computer_.AcceptInput("ivector", &cu_ivector); + Matrix ivectors(num_ivectors, + ivector.Dim()); + ivectors.CopyRowsFromVec(ivector); + CuMatrix cu_ivectors(ivectors); + computer_.AcceptInput("ivector", &cu_ivectors); } computer_.Run(); diff --git a/src/nnet3/decodable-simple-looped.h b/src/nnet3/decodable-simple-looped.h index fe40c220f8f..5aba5b10505 100644 --- a/src/nnet3/decodable-simple-looped.h +++ b/src/nnet3/decodable-simple-looped.h @@ -148,6 +148,11 @@ class DecodableNnetSimpleLoopedInfo { // to accept the iVectors bool has_ivectors_; + // The 3 computation requests that are used to create the looped + // computation are stored in the class, as we need them to work out + // exactly shich iVectors are needed. + ComputationRequest request1_, request2_, request3_; + // The compiled, 'looped' computation. NnetComputation computation_; }; diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index 71329d2e8fe..d77f19ef13c 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -80,8 +80,9 @@ int32 GetChunkSize(const Nnet &nnet, /// for negative a is not specified (except by relation with the division '/' /// operator), but in practice it would be <= 0 for almost all implementations. template I Mod(I m, I n) { - if (m >= 0) return m % n; - else return -((-m) % n); + I ans = m % n; + if (ans < 0) ans += n; + return ans; } @@ -171,15 +172,16 @@ void CreateLoopedComputationRequestSimple(const Nnet &nnet, } for (int32 t = chunk2_input_begin_t; t < chunk2_input_end_t; t++) { int32 ivector_t = t - Mod(t, ivector_period); - if (ivector_times1.count(ivector_t) == 0) + if (ivector_times2.count(ivector_t) == 0 && + ivector_times1.count(ivector_t) == 0) ivector_times2.insert(ivector_t); } for (int32 t = chunk3_input_begin_t; t < chunk3_input_end_t; t++) { int32 ivector_t = t - Mod(t, ivector_period); - if (ivector_times1.count(ivector_t) == 0 && - ivector_times2.count(ivector_t) == 0) { + if (ivector_times3.count(ivector_t) == 0 && + ivector_times2.count(ivector_t) == 0 && + ivector_times1.count(ivector_t) == 0) ivector_times3.insert(ivector_t); - } } } From 570e82f460579c5b60e2cc8a817af05890dbbb2c Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Fri, 9 Dec 2016 13:58:45 -0800 Subject: [PATCH 021/213] Adding another optimization to convert row-wise to whole-matrix ops where possible. --- src/cudamatrix/cu-math.cc | 12 +- src/nnet3/nnet-compile-looped.cc | 4 +- src/nnet3/nnet-compile.cc | 4 - src/nnet3/nnet-compute-test.cc | 4 +- src/nnet3/nnet-derivative-test.cc | 8 +- src/nnet3/nnet-optimize-test.cc | 4 +- src/nnet3/nnet-optimize-utils.cc | 190 ++++++++++++++++++++++++++++- src/nnet3/nnet-optimize-utils.h | 81 +++++++++++- src/nnet3/nnet-optimize.cc | 74 ++++++++--- src/nnet3/nnet-optimize.h | 114 +++++++++++++---- src/nnet3/nnet-simple-component.cc | 115 ++++++----------- 11 files changed, 469 insertions(+), 141 deletions(-) diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 047e808ae03..bb55302313a 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -481,15 +481,15 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, // Sigmoid(i_t_input), Sigmoid(f_t_input), // Tanh(c_part), Sigmoid(o_t_input), Tanh(c_t) Real i_t_self_repair = ( - deriv_sum_in(0, c) / count < sr_config(0) ? sr_config(5) : 0.0); + deriv_sum_in_mat(0, c) / count < sr_config(0) ? 
sr_config(5) : 0.0); Real f_t_self_repair = ( - deriv_sum_in(1, c) / count < sr_config(1) ? sr_config(6) : 0.0); + deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0); Real c_part_self_repair = ( - deriv_sum_in(2, c) / count < sr_config(2) ? sr_config(7) : 0.0); + deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0); Real o_t_self_repair = ( - deriv_sum_in(3, c) / count < sr_config(3) ? sr_config(8) : 0.0); + deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0); Real c_t_self_repair = ( - deriv_sum_in(4, c) / count < sr_config(4) ? sr_config(9) : 0.0); + deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0); // Note on how we add self-repair for sigmoids/tanh's. If self-repair // is activated for this unit, then... // For sigmoids we'd add -self_repair_scale * (2 * sigmoid(x) - 1.0) @@ -605,7 +605,7 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, // deriv_sum_out and deriv_sum_in might point to the same memory. for (int32 i = 0; i < 5; i++) (*self_repair_sum_out_mat)(i, c) = - (deriv_sum_in(i, c) / count < sr_config(i) ? num_rows : 0); + (deriv_sum_in_mat(i, c) / count < sr_config(i) ? num_rows : 0); (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum; (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum; diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index d77f19ef13c..62f29762580 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -18,6 +18,7 @@ // limitations under the License. #include "nnet3/nnet-compile-looped.h" +#include "nnet3/nnet-optimize-utils.h" #include "nnet3/nnet-utils.h" namespace kaldi { @@ -295,7 +296,8 @@ static bool CompileLoopedInternal( compiler.CreateComputation(compiler_opts, computation); optimize_opts.optimize_looped_computation = true; - Optimize(optimize_opts, nnet, computation); + Optimize(optimize_opts, nnet, + MaxOutputTimeInRequest(request3), computation); return computation->commands.size() != 0 && computation->commands.back().command_type == kGotoLabel; diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index ab4ea9917e3..930887d85ea 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -537,10 +537,6 @@ void Compiler::DoForwardComputationFromIndexes( } // if we got to here, it's not just a case of matrix-copy or matrix-add, // but it's still from a single source matrix. - // TODO: detect the case where the indexes are contiguous, but possibly - // with -1's at the beginning or end (e.g. [ -1 2 3 4 5 6 7 8 ]) and make - // it a standard matrix-copy command with new sub-matrices added as needed, - // possibly with a subset of the rows in the original sub-matrices. 
int32 indexes_index = computation->indexes.size(); computation->indexes.push_back(indexes); CommandType ctype = diff --git a/src/nnet3/nnet-compute-test.cc b/src/nnet3/nnet-compute-test.cc index 81cc67f71ae..c6a271abfbe 100644 --- a/src/nnet3/nnet-compute-test.cc +++ b/src/nnet3/nnet-compute-test.cc @@ -172,7 +172,9 @@ void UnitTestNnetCompute() { if (RandInt(0, 1) == 0) { NnetOptimizeOptions opt_config; - Optimize(opt_config, nnet, &computation); + Optimize(opt_config, nnet, + MaxOutputTimeInRequest(request), + &computation); { std::ostringstream os; computation.Print(os, nnet); diff --git a/src/nnet3/nnet-derivative-test.cc b/src/nnet3/nnet-derivative-test.cc index 511a6dc6bf9..5dbc8a126d1 100644 --- a/src/nnet3/nnet-derivative-test.cc +++ b/src/nnet3/nnet-derivative-test.cc @@ -139,7 +139,9 @@ void UnitTestNnetModelDerivatives() { if (limit_deriv_times) SetDerivTimesOptions(request, &opt_config); - Optimize(opt_config, nnet, &computation); + Optimize(opt_config, nnet, + MaxOutputTimeInRequest(request), + &computation); std::ostringstream os; computation.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); @@ -303,7 +305,9 @@ void UnitTestNnetInputDerivatives() { if (RandInt(0, 3) != 0 && allow_optimization) { NnetOptimizeOptions opt_config; // opt_config.initialize_undefined = false; // temp - Optimize(opt_config, nnet, &computation); + Optimize(opt_config, nnet, + MaxOutputTimeInRequest(request), + &computation); std::ostringstream os; computation.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index 7b64d67b72c..40f8d824a39 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -71,7 +71,9 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { NnetComputation computation_opt(computation); { - Optimize(opt_config, nnet, &computation_opt); + Optimize(opt_config, nnet, + MaxOutputTimeInRequest(request), + &computation_opt); std::ostringstream os; computation_opt.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 4d199d4a0d6..6744eb91e37 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -691,8 +691,7 @@ bool VariableMergingOptimizer::MergeVariables() { command_index++) { // This loop looks for pairs of sub-matrix indexes s1,s2 that we could // potentially merge into a single variable. - const NnetComputation::Command &c = - computation_->commands[command_index]; + const NnetComputation::Command &c = computation_->commands[command_index]; int32 s1 = -1, s2 = -1; if (c.command_type == kMatrixCopy && config_.remove_assignments) { @@ -1843,6 +1842,7 @@ void DerivativeTimeLimiter::PruneMatrices() { LimitMatrices(will_limit); } + void LimitDerivativeTimes(const Nnet &nnet, int32 min_deriv_time, int32 max_deriv_time, @@ -1853,6 +1853,187 @@ void LimitDerivativeTimes(const Nnet &nnet, } +/* + This helper function, used in ReplaceRowWithMatrixOps, detects + when the vector 'indexes' has a 'special structure'. The special structure + is: + zero or more -1's, then + a consecutive nonempty sequence of nonnegative numbers, e.g. 6 7 8 9 10, then + zero or more -1's. + + Note: this function assumes that any negative elements of 'indexes' are -1. + If there are elements less than -1, then it is an error, but this function + does not thoroughly check for that. 
'indexes' is required to be a nonempty + vector. + + If 'indexes' has the special structure then this function returns true + and sets the following values, which will explain with the following + example in mind: 'indexes = [ -1, -1, 5 6 7 8, -1 ]'. + - '*first_nonnegative_pos' is set to the number of initial -1's (and also + the location of the first nonnegative element): 2 in this case. + - '*first_nonnegative_value' is set to the value of the first nonnegative + element (5 in this case) + - '*num_nonnegative_values' is set to the number of nonnegative values in + the sequence (4 in this case). + If 'indexes' does not have this special structure, then this function returns + false, and the values of '*first_nonnegative_pos', + '*first_nonnegative_value' and '*num_nonnegative_indexes' on exit are + undefined. +*/ +static bool IndexesHaveSpecialStructure(const std::vector &indexes, + int32 *first_nonnegative_pos, + int32 *first_nonnegative_value, + int32 *num_nonnegative_indexes) { + KALDI_ASSERT(!indexes.empty()); + const int32 *indexes_ptr = &(indexes[0]); + size_t pos = 0, size = indexes.size(); + + // Find the first nonnegative element of 'indexes'. + for (; pos < size; ++pos) + if (indexes_ptr[pos] >= 0) + break; + if (pos == size) + return false; // all -1's... should not happen, but not our problem. + *first_nonnegative_pos = static_cast(pos); + int32 n = indexes_ptr[pos]; + *first_nonnegative_value = n; + // Find the first element after '*first_nonnegative_index' that isn't + // consecutive. + for (; pos < size; ++pos,++n) + if (indexes_ptr[pos] != n) + break; + + *num_nonnegative_indexes = n - *first_nonnegative_value; + + // Check that the remaining values are all <0 (assumed equal to -1, but + // checking <0 may be faster as just one instruction). + for (; pos < size; ++pos) + if (indexes_ptr[pos] >= 0) + return false; // does not have the special structure. + + return true; +} + + + +bool ReplaceRowWithMatrixOps(NnetComputation *computation) { + bool ans = false; + int32 num_commands = computation->commands.size(), + num_indexes = computation->indexes.size(); + for (int32 command_index = 0; command_index < num_commands; + command_index++) { + // non-const because we'll be changing it. + NnetComputation::Command &c = computation->commands[command_index]; + + int32 first_nonnegative_pos, + first_nonnegative_value, + num_nonnegative_indexes; + switch (c.command_type) { + case kCopyRows: case kAddRows: { + int32 indexes_index = c.arg3; + KALDI_ASSERT(indexes_index < num_indexes); + const std::vector &indexes = computation->indexes[indexes_index]; + if (IndexesHaveSpecialStructure(indexes, + &first_nonnegative_pos, + &first_nonnegative_value, + &num_nonnegative_indexes)) { + ans = true; + c.arg1 = computation->NewSubMatrix(c.arg1, first_nonnegative_pos, + num_nonnegative_indexes, + 0, -1); + c.arg2 = computation->NewSubMatrix(c.arg2, first_nonnegative_value, + num_nonnegative_indexes, + 0, -1); + c.command_type = (c.command_type == kCopyRows ? kMatrixCopy : + kMatrixAdd); + } + break; + } + default: + continue; + } + } + return ans; +} + +// This class implements the internals of the ExpandComputation() function (used +// in shortcut compilation); see comment by the declaration of +// ExpandComputation() in nnet-optimize-utils.h for overview. 
+class ComputationExpander { + public: + ComputationExpander(const NnetComputation &computation, + bool need_debug_info, + int32 num_n_values, + NnetComputation *expanded_computation): + computation_(computation), + need_debug_info_(need_debug_info), + num_n_values_(num_n_values), + expanded_computation_(expanded_computation) { + KALDI_ASSERT(num_n_values > 2); + } + + // This function call implements the functionality of the class, + // expanding the computation. + bool Expand(); + + private: + // This function sets up and computes the 'n_fast' vector (see comment + // by it for what this is. + void InitFastInfo(); + + // This function sets up the 'matrices' vector in 'expanded_computation_'. + // It's quite simple: it just multiplies all the num-rows by num_n_values_ and + // divides by 2, and leaves the num-cols the same. + void ComputeMatrices(); + + // This function, only called if need_debug_info_ is true, sets up + // the 'matrix_debug_info' vector in 'expanded_computation_'. + void ComputeDebugInfo(); + + // This function sets up the 'submatrices' vector in 'expanded_computation_'. + // Column ranges always stay the same, but for row ranges it's a little + // more complicated. + void ComputeSubmatrixInfo(); + + + // This function computes all the PrecomputedIndexes in the + // 'component_precomputed_indexes' member of 'expanded_computation_'. + // They are all generated from scratch, by using the Component::PrecomputedIndexes() + // member function. The 'input_indexes' and 'output_indexes' arguments are worked + // out from the 'debug_info' [if we're not generating debug_info we specially generate + // it for the specific matrices in question], and the 'need_backprop' + // argument is worked out by seeing whether there is a call to Backprop() with + // the same precomputed-indexes element. + void ComputePrecomputedIndexes(); + + // Computes the 'commands' member of the output. This function also adds as + // needed to 'indexes', 'indexes_multi' and 'indexes_ranges' in the output. + // Later on we can call RenumberComputation() to remove any duplicates that + // might result from this. + void ComputeCommands(); + + + // This 'n_fast' vector is indexed by the matrix-index in the computation, + // i.e. the same index as indexes computation_.matrix_info and + // expanded_computation_->matrix_info. For each matrix-index m > 0 it + // contains true if the 'n' varies 'fast', or false if the 'n' index varies + // 'slowly'. By 'fast' and 'slow', we mean in the same sense as is desribed + // in the comment for ComputationIsDecomposable() in nnet-optimize-utils.h. 
+ std::vector n_fast; + + + + + + + const NnetComputation &computation_; + bool need_debug_info_; + int32 num_n_values_; + NnetComputation *expanded_computation_; +}; + + + class ComputationLoopedOptimizer { public: ComputationLoopedOptimizer(const Nnet &nnet, @@ -2017,11 +2198,8 @@ class ComputationLoopedOptimizer { std::vector > matrix_to_pair_; std::vector segment_end_commands_; - - }; - // static int32 ComputationLoopedOptimizer::FindTimeShift( const NnetComputation &computation, @@ -2500,6 +2678,7 @@ void OptimizeLoopedComputation(const Nnet &nnet, } + void FixGotoLabel(NnetComputation *computation) { int32 num_commands = computation->commands.size(); if (num_commands == 0) @@ -2531,6 +2710,5 @@ void FixGotoLabel(NnetComputation *computation) { } - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index e289ff9126c..f3f27a12c8e 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -336,6 +336,8 @@ class DerivativeTimeLimiter { std::vector prune_info_; }; + + // This is the top-level interface to limit the times on which derivatives are // computed (e.g. for truncated BPTT); internally it uses class // DerivativeLimiter. Will do nothing if min_deriv_time and max_deriv_time are @@ -345,11 +347,80 @@ void LimitDerivativeTimes(const Nnet &nnet, int32 max_deriv_time, NnetComputation *computation); -/// This function detects submatrices, matrices, and members of indexes_multi -/// and indexes that are never used (e.g. due to changes made in other -/// optimization code), and removes them from the computation by way of suitable -/// renumbering. It does not remove no-ops from computation->commands_; to do -/// that, call RemoveNoOps(computation). +/** This function, used in 'shortcut' compilation where we first compile a + smaller computation with the same structure but only 2 distinct 'n' + values, works out whether a computation is 'decomposable'; if so, + it returns true and outputs the 'mini_request' with the same structure, + and the number of 'n' values. + + A computation is decomposable if the following conditions hold: + + - All of its inputs and outputs contain 'n' values for all 0 <= n < N, + for some N > 2. [we output this 'N' as 'num_n_values']. + - All of its inputs and outputs have 'regular' structure. + + What it means for an input or output (i.e. an IoSpecification) to have a + 'regular' structure, is as follows: + - The 't' and 'x' values present are the same for each 'n', + - The order in which the indexes appear is EITHER of the following: + - The 'n' index varies 'fast', i.e. the order is: + (t1,x1,0), (t1,x1,1) ... (t1,x1,N-1) \ + (t2,x2,0), (t2,x2,1) ... (t2,x2,N-1) ... + - The 'n' index varies 'slowly', i.e. the order is: + (t1,x1,0), (t2,x2,0) ... \ + (t1,x1,1), (t2,x2,1) ... \ + ... \ + (t1,x2,N-1), (t2,x2,N-1) ... + In either case, there does not have to be any particular rhyme or + reason to the order of the t and x values; the regularity on 'n' is + all that we care about. + */ +bool ComputationIsDecomposable(const ComputationRequest &request, + ComputationRequest *mini_request, + int32 *num_n_values); // TODO: implement this. + + +/** + This function is used in 'shortcut' compilation to expand a computation + that has been compiled for exactly 2 'n' values, to one that is suitable + for some num_n_values > 2. 
+ @param [in] computation The computation that was compiled for exactly + 2 'n' values (n=0 and n=1) + @param [in] need_debug_info True if we want to retain the 'debug_info' + in the output 'expanded_computation'. In any + case, the 'debug_info' is required in the + input computation. + @param [in] num_n_values The number of 'n' values we want in the output + computation + @param [out] expanded_computation The expanded computation. + + @return This function returns true if it succeeded, and false if it + could not expand the computation for some reason (e.g. there + was some non-simple component where the 'PrecomputedIndexes' + object could not be suitably expanded. If it returns false, + the output 'expanded_computation' is undefined (may contain junk). + */ +bool ExpandComputation(const NnetComputation &computation, + bool need_debug_info, + int32 num_n_values, + NnetComputation *expanded_computation); + + + +/// This function detects cases where commands of type kCopyRows, kAddRows or +/// kAddToRows can be converted to commands of type kMatrixCopy or kMatrixAdd, +/// and converts them (this may involve adding submatrices). After doing this +/// you should at some point do RenumberComputation(), which will remove any +/// now-unused members of computation->indexes. +/// This function returns true if it made any changes to the computation. +bool ReplaceRowWithMatrixOps(NnetComputation *computation); + +/// This function detects submatrices and matrices that are never used (e.g. due +/// to changes made in other optimization code), and members of indexes, +/// indexes_multi and indexes_ranges that are unused or are duplicates, and +/// removes them from the computation by way of suitable renumbering. It does +/// not remove no-ops from computation->commands_; to do that, call +/// RemoveNoOps(computation). void RenumberComputation(NnetComputation *computation); /// Removes commands of type kNoOperation in the computation. 
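The 'special structure' that ReplaceRowWithMatrixOps looks for is easiest to see on a concrete index vector. The following standalone sketch is not part of this patch; it uses plain int rather than Kaldi's int32 and an illustrative function name, but it mirrors the detection logic of IndexesHaveSpecialStructure() above and shows how a row-wise copy maps onto a whole-matrix copy between two new sub-matrices:

#include <cassert>
#include <cstdio>
#include <vector>

// Returns true if 'indexes' is: zero or more -1's, then a consecutive run of
// nonnegative values (e.g. 5 6 7 8), then zero or more -1's.  On success,
// *pos is the offset of the run, *value its first element, *num its length.
static bool HasSpecialStructure(const std::vector<int> &indexes,
                                int *pos, int *value, int *num) {
  assert(!indexes.empty());
  size_t p = 0, size = indexes.size();
  while (p < size && indexes[p] < 0) ++p;            // skip leading -1's
  if (p == size) return false;                       // all -1's: not usable
  *pos = static_cast<int>(p);
  *value = indexes[p];
  int n = indexes[p];
  while (p < size && indexes[p] == n) { ++p; ++n; }  // consecutive run
  *num = n - *value;
  for (; p < size; ++p)                              // remainder must be -1's
    if (indexes[p] >= 0) return false;
  return true;
}

int main() {
  // For a kCopyRows command, indexes[i] is the source row copied into
  // destination row i, with -1 meaning "leave that destination row alone".
  // Here rows 5..8 of the source go into rows 2..5 of the destination.
  std::vector<int> indexes = { -1, -1, 5, 6, 7, 8, -1 };
  int pos, value, num;
  if (HasSpecialStructure(indexes, &pos, &value, &num))
    std::printf("run of %d rows: dest offset %d, src offset %d\n",
                num, pos, value);
  return 0;
}

Running this prints "run of 4 rows: dest offset 2, src offset 5"; given such a structure, the optimization can replace the row-wise kCopyRows (or kAddRows) command with a kMatrixCopy (or kMatrixAdd) between two new sub-matrices, one covering rows [pos, pos+num) of the destination and one covering rows [value, value+num) of the source.
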
diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 6e4242ace09..a1a62e3944c 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -34,7 +34,13 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &propagate_in_place); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &backprop_in_place); - ExpectToken(is, binary, ""); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &replace_row_with_matrix_ops); + ReadToken(is, binary, &tok); + } + KALDI_ASSERT(tok == ""); ReadBasicType(is, binary, &convert_addition); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &remove_assignments); @@ -52,7 +58,7 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &min_deriv_time); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &max_deriv_time); - std::string tok; + ReadToken(is, binary, &tok); if (tok == "") { ReadBasicType(is, binary, &max_deriv_time_relative); @@ -73,6 +79,8 @@ void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, propagate_in_place); WriteToken(os, binary, ""); WriteBasicType(os, binary, backprop_in_place); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, replace_row_with_matrix_ops); WriteToken(os, binary, ""); WriteBasicType(os, binary, convert_addition); WriteToken(os, binary, ""); @@ -403,8 +411,27 @@ void ConvertAdditionToAssignment(const Nnet &nnet, } } + +int32 MaxOutputTimeInRequest(const ComputationRequest &request) { + int32 ans = std::numeric_limits::min(); + for (size_t i = 0; i < request.outputs.size(); i++) { + const std::vector &indexes (request.outputs[i].indexes); + std::vector::const_iterator iter = indexes.begin(), + end = indexes.end(); + for (; iter != end; ++iter) + if (iter->t > ans) + ans = iter->t; + } + if (ans == std::numeric_limits::min()) { + KALDI_ERR << "Failed to find any output indexes in computation request."; + } + return ans; +} + + void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, + int32 max_output_time_in_request, NnetComputation *computation) { if (GetVerboseLevel() >= 4) CheckComputation(nnet, *computation, true); @@ -415,7 +442,7 @@ void Optimize(const NnetOptimizeOptions &config, int32 max_deriv_time = config.max_deriv_time; if (config.max_deriv_time_relative != std::numeric_limits::max()) max_deriv_time = config.max_deriv_time_relative + - MaxOutputTimeInRequest(request); + max_output_time_in_request; LimitDerivativeTimes(nnet, config.min_deriv_time, max_deriv_time, computation); } @@ -443,6 +470,21 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, false); } + if (config.optimize && config.replace_row_with_matrix_ops) { + if (ReplaceRowWithMatrixOps(computation)) { + // if anything was changed... + + // We have to call RenumberComputation() to get rid of any removed + // indexes... actually this could be a little wasteful, but unfortunately + // it doesn't seem like we'd otherwise be doing any renumbering past this + // point. + RenumberComputation(computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } + } + + if (config.optimize && config.initialize_undefined) { RemoveUnnecessaryZeroing(nnet, computation); if (GetVerboseLevel() >= 4) @@ -510,32 +552,32 @@ size_t ComputationRequestHasher::IoSpecificationToInt(const IoSpecification& spe // it makes the hasher faster. 
StringHasher string_hasher; ans = string_hasher(spec.name); - std::vector::const_iterator itr = spec.indexes.begin(), + std::vector::const_iterator iter = spec.indexes.begin(), end = spec.indexes.end(), med = end; - if (med > itr + n) + if (med > iter + n) med = iter + n; - for (; itr != med; ++itr) { - ans += (*itr).n * 1619; - ans += (*itr).t * 15649; - ans += (*itr).x * 89809; + for (; iter != med; ++iter) { + ans += iter->n * 1619; + ans += iter->t * 15649; + ans += iter->x * 89809; } // after the first 'n' values, look only at every n'th value. this makes the // hashing much faster, and in the kinds of structures that we actually deal // with, we shouldn't get unnecessary hash collisions as a result of this // optimization. - for (; iter < end; itr += n) { - ans += (*itr).n * 1619; - ans += (*itr).t * 15649; - ans += (*itr).x * 89809; + for (; iter < end; iter += n) { + ans += iter->n * 1619; + ans += iter->t * 15649; + ans += iter->x * 89809; } return ans; } void CachingOptimizingCompiler::UpdateCache(const ComputationRequest *request, NnetComputation *computation) { - if (computation_cache_.size() == cache_capacity_) { + if (computation_cache_.size() == config_.cache_capacity) { // full, locate the least-recently-accessed request const CacheType::iterator it = computation_cache_.find(access_queue_.front()); @@ -635,7 +677,9 @@ const NnetComputation* CachingOptimizingCompiler::Compile( ComputationChecker checker(check_config, nnet_, *computation); checker.Check(); } - Optimize(opt_config_, nnet_, computation); + Optimize(opt_config_, nnet_, + MaxOutputTimeInRequest(*request), + computation); if (GetVerboseLevel() >= verbose_cutoff) { std::ostringstream os; computation->Print(os, nnet_); diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index c01563f11cb..86c6427396a 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -29,7 +29,7 @@ namespace kaldi { namespace nnet3 { -// Options class for optimizing a NnetComputation The main projected use for +// Options class for optimizing a NnetComputation. The main projected use for // this is in debugging the optimization code itself, so that if an error is // detected, we can work out which optimization was responsible for the error. struct NnetOptimizeOptions { @@ -37,6 +37,7 @@ struct NnetOptimizeOptions { bool consolidate_model_update; bool propagate_in_place; bool backprop_in_place; + bool replace_row_with_matrix_ops; bool convert_addition; bool remove_assignments; bool allow_left_merge; @@ -46,25 +47,29 @@ struct NnetOptimizeOptions { bool allocate_from_other; int32 min_deriv_time; int32 max_deriv_time; + int32 max_deriv_time_relative; // optimize_looped_computation is a 'hidden config' not available from // the command line; it's set to true to enable the optimization for // looped computation that turns a linear computation into a loop. 
bool optimize_looped_computation; - NnetOptimizeOptions(): optimize(true), - consolidate_model_update(true), - propagate_in_place(true), - backprop_in_place(true), - convert_addition(true), - remove_assignments(true), - allow_left_merge(true), - allow_right_merge(true), - initialize_undefined(true), - move_sizing_commands(true), - allocate_from_other(true), - min_deriv_time(std::numeric_limits::min()), - max_deriv_time(std::numeric_limits::max()), - optimize_looped_computation(false) { } + NnetOptimizeOptions(): + optimize(true), + consolidate_model_update(true), + propagate_in_place(true), + backprop_in_place(true), + replace_row_with_matrix_ops(true), + convert_addition(true), + remove_assignments(true), + allow_left_merge(true), + allow_right_merge(true), + initialize_undefined(true), + move_sizing_commands(true), + allocate_from_other(true), + min_deriv_time(std::numeric_limits::min()), + max_deriv_time(std::numeric_limits::max()), + max_deriv_time_relative(std::numeric_limits::max()), + optimize_looped_computation(false) { } void Register(OptionsItf *opts) { opts->Register("optimize", &optimize, "Set this to false to turn off all " @@ -104,15 +109,51 @@ struct NnetOptimizeOptions { "the maximum t value that you want derivatives to be computed " "at when updating the model. This is an optimization that " "saves time in the backprop phase for recurrent frameworks"); + opts->Register("max-deriv-time-relative", &max_deriv_time_relative, + "An alternative mechanism for setting the --max-deriv-time, " + "suitable for situations where the length of the egs is " + "variable. If set, it is equivalent to setting the " + "--max-deriv-time to this value plus the largest 't' value " + "in any 'output' node of the computation request."); } void Read(std::istream &is, bool binary); void Write(std::ostream &os, bool binary) const; bool operator == (const NnetOptimizeOptions &other) const; }; -/// This is the top-level function for optimizing a computation. + +/* This utility function, used in code that calls LimitDerivativeTimes() (and + required in code that calls Optimize(), returns the largest time + 't' in any of the 'outputs' in the computation request, or crashes if there + are no outputs (or no cindexes in those outputs). */ +int32 MaxOutputTimeInRequest(const ComputationRequest &request); + + +/** This is the top-level function for optimizing a computation. Note: it + should really be called OptimizeAndPostprocess(), because there is at least + one thing it does (reordering I/O commands) that is necessary for a + computation to be run. + + @param [in] config The options that control, among other things, + which optimizations to apply. + @param [in] nnet The neural net for which the computation is being built + @param [in] max_output_time_in_request This value is only needed when the + max-deriv-time-relative config value is set in + 'config'. It should be set to the largest 't' value + encountered in any of the indexes in the 'output' + IoSpecifications in the ComputationRequests used to + compile the computation. However if there are multiple + ComputationRequests (i.e. it was an online computation) + you can just set it to any value you want, because + backpropagation is not supported so the + max-deriv-time-relative configuration value would not + have any effect. + @param [in,out] computation The computation to be optimized; this function + modifies it in-place. 
+ */ void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, + int32 max_output_time_in_request, NnetComputation *computation); // Hash function for ComputationRequest. It converts @@ -134,20 +175,49 @@ struct ComputationRequestPtrEqual { } }; + + +struct CachingOptimizingCompilerOptions { + bool use_shortcut; + int32 write_cache; + int32 cache_capacity; + + + + CachingOptimizingCompilerOptions(): + use_shortcut(true), + cache_capacity(64) { } + + void Register(OptionsItf *opts) { + opts->Register("use-shortcut", &use_shortcut, + "If true, use the 'shortcut' in compilation whereby " + "computation requests with regular structure are identified " + "as such, a computation with a smaller number of distinct " + "values of 'n' is compiled (e.g. 2), and the compiled " + "computation is expanded to match the size of the real " + "computation request."); + opts->Register("cache-capacity", &cache_capacity, + "Determines how many computations the computation-cache will " + "store (most-recently-used)."); + } +}; + /// This class enables you to do the compilation and optimization in one call, /// and also ensures that if the ComputationRequest is identical to the previous /// one, the compilation process is not repeated. class CachingOptimizingCompiler { public: CachingOptimizingCompiler(const Nnet &nnet, - const int32 capacity = 20): - nnet_(nnet), cache_capacity_(capacity) { } + const CachingOptimizingCompilerOptions config = + CachingOptimizingCompilerOptions()): + nnet_(nnet), config_(config) { } /// Note: nnet is retained as a const reference but opt_config is copied. CachingOptimizingCompiler(const Nnet &nnet, const NnetOptimizeOptions &opt_config, - const int32 capacity = 20): - nnet_(nnet), opt_config_(opt_config), cache_capacity_(capacity) { } + const CachingOptimizingCompilerOptions config = + CachingOptimizingCompilerOptions()): + nnet_(nnet), config_(config), opt_config_(opt_config) { } ~CachingOptimizingCompiler(); /// Does the compilation and returns a const pointer to @@ -159,6 +229,7 @@ class CachingOptimizingCompiler { void WriteCache(std::ostream &os, bool binary) const; private: const Nnet &nnet_; + CachingOptimizingCompilerOptions config_; NnetOptimizeOptions opt_config_; // The access queue for keeping track of the freshness of computation. @@ -186,9 +257,6 @@ class CachingOptimizingCompiler { NnetComputation *computation); // This function updates the recently accessed queue. void UpdateAccessQueue(CacheType::iterator &cit); - // This configuration value determines how many unique Computations - // to cache in our most-recently-used cache. 
- int32 cache_capacity_; }; diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 5935b4dacad..84a262b1695 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -87,34 +87,27 @@ void PnormComponent::Write(std::ostream &os, bool binary) const { } -void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion, - bool dropout_per_frame) { +void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion) { dropout_proportion_ = dropout_proportion; - dropout_per_frame_ = dropout_per_frame; dim_ = dim; } void DropoutComponent::InitFromConfig(ConfigLine *cfl) { int32 dim = 0; BaseFloat dropout_proportion = 0.0; - bool dropout_per_frame = false; bool ok = cfl->GetValue("dim", &dim) && cfl->GetValue("dropout-proportion", &dropout_proportion); - cfl->GetValue("dropout-per-frame", &dropout_per_frame); - // for this stage, dropout is hard coded in - // normal mode if not declared in config if (!ok || cfl->HasUnusedValues() || dim <= 0 || dropout_proportion < 0.0 || dropout_proportion > 1.0) - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; - Init(dim, dropout_proportion, dropout_per_frame); + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; + Init(dim, dropout_proportion); } std::string DropoutComponent::Info() const { std::ostringstream stream; stream << Type() << ", dim=" << dim_ - << ", dropout-proportion=" << dropout_proportion_ - << ", dropout-per-frame=" << (dropout_per_frame_ ? "true" : "false"); + << ", dropout-proportion=" << dropout_proportion_; return stream.str(); } @@ -126,29 +119,16 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes, BaseFloat dropout = dropout_proportion_; KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0); - if (!dropout_per_frame_) { - // This const_cast is only safe assuming you don't attempt - // to use multi-threaded code with the GPU. - const_cast&>(random_generator_).RandUniform(out); - out->Add(-dropout); // now, a proportion "dropout" will be <0.0 - // apply the function (x>0?1:0). Now, a proportion - // "dropout" will be zero and (1 - dropout) will be 1.0. - out->ApplyHeaviside(); + // This const_cast is only safe assuming you don't attempt + // to use multi-threaded code with the GPU. + const_cast&>(random_generator_).RandUniform(out); - out->MulElements(in); - } else { - // randomize the dropout matrix by row, - // i.e. [[1,1,1,1],[0,0,0,0],[0,0,0,0],[1,1,1,1],[0,0,0,0]] - CuMatrix tmp(1, out->NumRows(), kUndefined); - // This const_cast is only safe assuming you don't attempt - // to use multi-threaded code with the GPU. - const_cast&>(random_generator_).RandUniform(&tmp); - tmp.Add(-dropout); - tmp.ApplyHeaviside(); - out->CopyColsFromVec(tmp.Row(0)); - out->MulElements(in); - } + out->Add(-dropout); // now, a proportion "dropout" will be <0.0 + out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will + // be zero and (1 - dropout) will be 1.0. + + out->MulElements(in); } @@ -170,25 +150,11 @@ void DropoutComponent::Backprop(const std::string &debug_info, void DropoutComponent::Read(std::istream &is, bool binary) { - std::string token; - ReadToken(is, binary, &token); - if (token == "") { - ReadToken(is, binary, &token); - } - KALDI_ASSERT(token == ""); - ReadBasicType(is, binary, &dim_); // read dimension. 
- ReadToken(is, binary, &token); - KALDI_ASSERT(token == ""); - ReadBasicType(is, binary, &dropout_proportion_); // read dropout rate - ReadToken(is, binary, &token); - if (token == "") { - ReadBasicType(is, binary, &dropout_per_frame_); // read dropout mode - ReadToken(is, binary, &token); - KALDI_ASSERT(token == ""); - } else { - dropout_per_frame_ = false; - KALDI_ASSERT(token == ""); - } + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dropout_proportion_); + ExpectToken(is, binary, ""); } void DropoutComponent::Write(std::ostream &os, bool binary) const { @@ -197,8 +163,6 @@ void DropoutComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_proportion_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, dropout_per_frame_); WriteToken(os, binary, ""); } @@ -1550,7 +1514,7 @@ void RepeatedAffineComponent::InitFromConfig(ConfigLine *cfl) { num_repeats, param_stddev, bias_mean, bias_stddev); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } @@ -2378,13 +2342,12 @@ std::string ConstantFunctionComponent::Info() const { } ConstantFunctionComponent::ConstantFunctionComponent(): - UpdatableComponent(), input_dim_(-1), is_updatable_(true), - use_natural_gradient_(true) { } + input_dim_(-1), is_updatable_(true), use_natural_gradient_(true) { } ConstantFunctionComponent::ConstantFunctionComponent( const ConstantFunctionComponent &other): - UpdatableComponent(other), input_dim_(other.input_dim_), - output_(other.output_), is_updatable_(other.is_updatable_), + input_dim_(other.input_dim_), output_(other.output_), + is_updatable_(other.is_updatable_), use_natural_gradient_(other.use_natural_gradient_), preconditioner_(other.preconditioner_) { } @@ -3652,7 +3615,7 @@ void ConvolutionComponent::InitFromConfig(ConfigLine *cfl) { } if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } @@ -3741,7 +3704,8 @@ void ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes, kUndefined); InputToInputPatches(in, &patches); CuSubMatrix* filter_params_elem = new CuSubMatrix( - filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); + filter_params_, 0, filter_params_.NumRows(), 0, + filter_params_.NumCols()); std::vector* > tgt_batch, patch_batch, filter_params_batch; @@ -3895,9 +3859,10 @@ void ConvolutionComponent::Backprop(const std::string &debug_info, kSetZero); std::vector* > patch_deriv_batch, out_deriv_batch, - filter_params_batch; + filter_params_batch; CuSubMatrix* filter_params_elem = new CuSubMatrix( - filter_params_, 0, filter_params_.NumRows(), 0, filter_params_.NumCols()); + filter_params_, 0, filter_params_.NumRows(), 0, + filter_params_.NumCols()); for (int32 x_step = 0; x_step < num_x_steps; x_step++) { for (int32 y_step = 0; y_step < num_y_steps; y_step++) { @@ -3974,8 +3939,9 @@ void ConvolutionComponent::Update(const std::string &debug_info, for (int32 y_step = 0; y_step < num_y_steps; y_step++) { int32 patch_number = x_step * num_y_steps + y_step; filters_grad_batch.push_back(new CuSubMatrix( - filters_grad_blocks_batch.RowRange( - patch_number * 
filters_grad.NumRows(), filters_grad.NumRows()))); + filters_grad_blocks_batch.RowRange( + patch_number * filters_grad.NumRows(), + filters_grad.NumRows()))); input_patch_batch.push_back(new CuSubMatrix( input_patches.ColRange(patch_number * filter_dim, filter_dim))); @@ -4447,7 +4413,7 @@ void PermuteComponent::InitFromConfig(ConfigLine *cfl) { << column_map_str; if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; @@ -5144,13 +5110,6 @@ Component* LstmNonlinearityComponent::Copy() const { return new LstmNonlinearityComponent(*this); } -void LstmNonlinearityComponent::ZeroStats() { - value_sum_.SetZero(); - deriv_sum_.SetZero(); - self_repair_total_.SetZero(); - count_ = 0.0; -} - void LstmNonlinearityComponent::Scale(BaseFloat scale) { params_.Scale(scale); value_sum_.Scale(scale); @@ -5352,12 +5311,14 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - if (!ok) + << cfl->UnusedValues(); + if (ok) { + Init(cell_dim, param_stddev, tanh_self_repair_threshold, + sigmoid_self_repair_threshold, self_repair_scale); + } else { KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; - Init(cell_dim, param_stddev, tanh_self_repair_threshold, - sigmoid_self_repair_threshold, self_repair_scale); + } } From f337886b9aca29a9013798b1523242fefd12d8a2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 7 Dec 2016 21:13:33 -0500 Subject: [PATCH 022/213] Early parts of 'shortcut' compilation --- src/nnet3/nnet-optimize-utils.cc | 1501 +++++++----------------------- src/nnet3/nnet-optimize-utils.h | 413 +++++--- src/nnet3/nnet-optimize.cc | 338 ++----- src/nnet3/nnet-optimize.h | 70 +- 4 files changed, 694 insertions(+), 1628 deletions(-) diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 6744eb91e37..b2ebb22ad71 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -33,12 +33,8 @@ void IdentifySubmatrixArgs(NnetComputation::Command *c, case kAllocMatrixZeroed: case kAllocMatrixUndefined: case kDeallocMatrix: - submatrix_args->push_back(&c->arg1); - break; case kAllocMatrixFromOther: case kAllocMatrixFromOtherZeroed: - submatrix_args->push_back(&c->arg1); - submatrix_args->push_back(&c->arg2); break; case kPropagate: submatrix_args->push_back(&c->arg3); @@ -68,13 +64,8 @@ void IdentifySubmatrixArgs(NnetComputation::Command *c, case kCopyToRowsMulti: submatrix_args->push_back(&c->arg1); break; - case kAcceptInput: case kProvideOutput: - submatrix_args->push_back(&c->arg1); - break; case kNoOperation: case kNoOperationMarker: - case kNoOperationLabel: - case kGotoLabel: break; default: KALDI_ERR << "Unknown command type."; @@ -96,13 +87,40 @@ void IdentifySubmatrixArgs(std::vector *commands, } +void IdentifyMatrixArgs(std::vector *commands, + std::vector *matrix_args) { + matrix_args->clear(); + std::vector::iterator iter = commands->begin(), + end = commands->end(); + std::vector this_matrix_args; + for (; iter != end; ++iter) { + IdentifyMatrixArgs(&(*iter), &this_matrix_args); + matrix_args->insert(matrix_args->end(), + this_matrix_args.begin(), + this_matrix_args.end()); + } +} + -void IdentifyMatrixArgsInComputation(NnetComputation *computation, 
+void IdentifyMatrixArgsInComputation(bool include_in_submatrices, + NnetComputation *computation, std::vector *matrix_args) { + IdentifyMatrixArgs(&(computation->commands), matrix_args); int32 num_submatrices = computation->submatrices.size(); - matrix_args->reserve(computation->submatrices.size()); - for (int32 s = 1; s < num_submatrices; s++) - matrix_args->push_back(&(computation->submatrices[s].matrix_index)); + matrix_args->reserve(matrix_args->size() + + (include_in_submatrices ? + computation->submatrices.size() : 0) + + 2 * computation->input_output_info.size()); + if (include_in_submatrices) + for (int32 s = 1; s < num_submatrices; s++) + matrix_args->push_back(&(computation->submatrices[s].matrix_index)); + unordered_map >::iterator + iter = computation->input_output_info.begin(), + end = computation->input_output_info.end(); + for (; iter != end; ++iter) { + matrix_args->push_back(&(iter->second.first)); + matrix_args->push_back(&(iter->second.second)); + } } @@ -147,112 +165,26 @@ void IdentifyIndexesArgs(std::vector *commands, } } -// We declare this class in the .cc file, we don't need to export it. -// It's used inside RenumberComputation. -class ComputationRenumberer { - public: - ComputationRenumberer(NnetComputation *computation): - computation_(computation) { } - - void Renumber(); - private: - // this function removes unused vectors within the indexes_multi_ array, i.e. - // ones that are not referenced in the computation. - void RemoveUnusedIndexesMulti(); - // this function computes the submatrix_is_used_ vector, saying whether each - // of the original submatrices is referenced somewhere. - void ComputeSubmatrixIsUsed(); - // this function computes the matrix_is_used_ vector (from the - // submatrix_is_used_ vector, from computation_->input_output_info, and from - // computation_->commands, saying whether each of the original matrices is - // referenced somewhere, directly or indirectly. - void ComputeMatrixIsUsed(); - // This function sets up mappings from old to new matrix and submatrix indexes, - // writing to num_{,sub}matrices_new_ and old_to_new_{,sub}matrix_. - void SetUpMappings(); - // This function renumbers submatrix indexes appearing within commands and - // indexes_multi_, and then removes unused submatrices from the list of - // submatrices while leaving the matrix-indexes at their old values (they will - // be mapped by RenumberMatrices()). - void RenumberSubmatrices(); - // renumber matrix indexes appearing within 'commmands', within 'submatrices' - // and 'input_output_info'; renumber 'matrices' and if applicable - // 'debug_info'. - void RenumberMatrices(); - // removes duplicates within the indexes_multi array itself. - void RemoveIndexesMultiDuplicates(); - // removes unused elements and duplicates within 'computation->indexes' - void RenumberIndexes(); - // removes unused elements and duplicates within 'computation->indexes_ranges' - void RenumberIndexesRanges(); - - struct SubMatrixHasher { - SubMatrixHasher() { } - size_t operator () (const NnetComputation::SubMatrixInfo &submat) const { - // these numbers are arbitrarily chosen primes. - return submat.matrix_index + - 19553 * submat.row_offset + - 29297 * submat.num_rows + - 42209 * submat.col_offset + - 56527 * submat.num_cols; - } - }; - - - // Here, T will be int32 or std::pair - template - struct PointerCompare { - // This provides an operator < on two vectors of ints or pairs of ints. 
It - // is designed to provide a total order on the vectors while accessing as - // small a portion of the vectors' data as possible. It's used in removing - // duplicates from computation_->indexes_multi and computation_->indexes. - // First it compares the length, then it does lexicographical compare. - bool operator ()(const std::vector *ptr1, - const std::vector *ptr2) const { - size_t size1 = ptr1->size(), size2 = ptr2->size(); - if (size1 < size2) return true; - else if (size1 > size2) return false; - else return (*ptr1 < *ptr2); // use the std::vector operator <, which is - // lexicographical comparison. - } - }; - - /// creates a renumbering that removes the elements in "to_remove", - /// e.g. if old_num_elements = 3 and to_remove = [1], would output - /// the vector [ 0, -1, 1 ]. - static void CreateRenumbering(int32 old_num_elements, - const std::vector &to_remove, - std::vector *renumbering); - - /// creates a renumbering from old to new index that removes the unused - /// elements, e.g. if used == [ true, false, true, true], would output the - /// vector [ 0, -1, 1, 2 ]. Returns number of new elements, i.e. the - /// number of elements of 'used' that were true. - static int32 CreateRenumbering(const std::vector &used, - std::vector *renumbering); - - // vector of bool indexed by original submatrix-index, that is true if a - // submatrix-index is used somewhere in the computation (always true for - // the zeroth element). - std::vector submatrix_is_used_; - // vector of bool indexed by original submatrix-index, that is true if a - // submatrix-index will be kept; this is like submatrix_is_used_; but for - // duplicate submatrices, all but the first duplicate will be marked false). - std::vector submatrix_is_kept_; - // vector of bool indexed by original-matrix-index > 0, that is true if a - // matrix-index is used somewhere in the computation, directly or indirectly. - // always true for the zeroth element. - std::vector matrix_is_used_; - NnetComputation *computation_; - int32 num_matrices_new_; - int32 num_submatrices_new_; - std::vector old_to_new_matrix_; // numbered by orig-matrix-index, gives - // new-matrix-index. -1 for removed - // ones. - std::vector old_to_new_submatrix_; // numbered by orig-submatrix-index, - // gives new-submatrix-index. -1 - // for removed ones. -}; + + +void IdentifyMatrixArgs(NnetComputation::Command *c, + std::vector *matrix_args) { + matrix_args->clear(); + switch (c->command_type) { + case kAllocMatrixZeroed: + case kAllocMatrixUndefined: + case kDeallocMatrix: + matrix_args->push_back(&c->arg1); + break; + case kAllocMatrixFromOther: + case kAllocMatrixFromOtherZeroed: + matrix_args->push_back(&c->arg1); + matrix_args->push_back(&c->arg2); + break; + default: + break; + } +} // static int32 ComputationRenumberer::CreateRenumbering( @@ -344,10 +276,22 @@ void ComputationRenumberer::ComputeMatrixIsUsed() { matrix_is_used_.clear(); matrix_is_used_.resize(computation_->matrices.size(), false); matrix_is_used_[0] = true; + + std::vector matrix_args; + bool include_in_submatrices = false; + IdentifyMatrixArgsInComputation(include_in_submatrices, + computation_, &matrix_args); + std::vector::iterator iter = matrix_args.begin(), + end = matrix_args.end(); + for (; iter != end; ++iter) { + int32 matrix_index = **iter; + if (matrix_index > 0) + matrix_is_used_[matrix_index] = true; + } // We also need to take into account when matrices are used indirectly via // submatrices (which is actually the main way they are accessed). 
- int32 num_submatrices = computation_->submatrices.size(); - for (int32 s = 1; s < num_submatrices; s++) { + int32 num_submatrices_orig = computation_->submatrices.size(); + for (int32 s = 1; s < num_submatrices_orig; s++) { int32 matrix_index = computation_->submatrices[s].matrix_index; if (submatrix_is_used_[s]) matrix_is_used_[matrix_index] = true; @@ -411,15 +355,20 @@ void ComputationRenumberer::RenumberSubmatrices() { void ComputationRenumberer::RenumberMatrices() { std::vector matrix_args; - int32 num_submatrices = computation_->submatrices.size(); - for (int32 s = 1; s < num_submatrices; s++) { - int32 *matrix_index = &(computation_->submatrices[s].matrix_index); - // old_to_new_matrix_[s] for s > 0 is only <= 0 (actually, -1) for - // submatrices that are never accessed, and these should never appear - // in this list. (presumably because we renumber the submatrices first). - int32 new_matrix_index = old_to_new_matrix_[*matrix_index]; - KALDI_ASSERT(new_matrix_index > 0); - *matrix_index = new_matrix_index; + bool include_in_submatrices = true; + IdentifyMatrixArgsInComputation(include_in_submatrices, + computation_, &matrix_args); + std::vector::iterator iter = matrix_args.begin(), + end = matrix_args.end(); + for (; iter != end; ++iter) { + if (**iter > 0) { + int32 new_matrix_index = old_to_new_matrix_[**iter]; + // old_to_new_matrix_[s] for s > 0 is only <= 0 (actually, -1) for + // submatrices that are never accessed, and these should never appear + // in this list. + KALDI_ASSERT(new_matrix_index > 0); + **iter = new_matrix_index; + } } std::vector new_matrices; @@ -652,7 +601,6 @@ void RenumberComputation(NnetComputation *computation) { renumberer.Renumber(); } - void RemoveNoOps(NnetComputation *computation) { std::vector::iterator input_iter = computation->commands.begin(), @@ -667,12 +615,87 @@ void RemoveNoOps(NnetComputation *computation) { computation->commands.resize(output_iter - computation->commands.begin()); } +/// Wherever matrix orig_matrix_index appears in the input of the network +/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. +/// Returns true if it did replace it. +bool ReplaceInInput( + const Nnet &nnet, + int32 orig_matrix_index, int32 new_matrix_index, + NnetComputation *computation) { + bool ans = false; + int32 num_matrices = computation->matrices.size(); + KALDI_ASSERT(orig_matrix_index > 0 && orig_matrix_index < num_matrices && + new_matrix_index > 0 && new_matrix_index < num_matrices); + unordered_map >::iterator + iter = computation->input_output_info.begin(), + end = computation->input_output_info.end(); + for (; iter != end; ++iter) { + int32 network_node = iter->first, + &value_matrix_index = iter->second.first, + &deriv_matrix_index = iter->second.second; + if (nnet.IsOutputNode(network_node)) { + // deriv_matrix_index would be an input to the computation. + if (deriv_matrix_index == orig_matrix_index) { + deriv_matrix_index = new_matrix_index; + ans = true; + } + } else { + // value_matrix_index would be an input to the computation. + if (value_matrix_index == orig_matrix_index) { + value_matrix_index = new_matrix_index; + ans = true; + } + } + } + return ans; +} + + +/// Wherever matrix orig_matrix_index appears in the output of the network +/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. +/// Returns true if it did replace it. 
+bool ReplaceInOutput( + const Nnet &nnet, int32 orig_matrix_index, int32 new_matrix_index, + NnetComputation *computation) { + bool ans = false; + int32 num_matrices = computation->matrices.size(); + KALDI_ASSERT(orig_matrix_index > 0 && orig_matrix_index < num_matrices && + new_matrix_index > 0 && new_matrix_index < num_matrices); + unordered_map >::iterator + iter = computation->input_output_info.begin(), + end = computation->input_output_info.end(); + for (; iter != end; ++iter) { + int32 network_node = iter->first, + &value_matrix_index = iter->second.first, + &deriv_matrix_index = iter->second.second; + if (nnet.IsOutputNode(network_node)) { + // value_matrix_index would be an output of the computation. + if (value_matrix_index == orig_matrix_index) { + value_matrix_index = new_matrix_index; + ans = true; + } + } else { + // deriv_matrix_index would be an output of the computation. + if (deriv_matrix_index == orig_matrix_index) { + // we'd only have derivatives for actual inputs. [note: we also allow + // users to provide inputs for component nodes, but these would not have + // derivatives.] + KALDI_ASSERT(nnet.IsInputNode(network_node)); + deriv_matrix_index = new_matrix_index; + ans = true; + } + } + } + return ans; +} + VariableMergingOptimizer::VariableMergingOptimizer( const NnetOptimizeOptions &config, const Nnet &nnet, + const ComputationRequest &request, NnetComputation *computation): - config_(config), nnet_(nnet), + config_(config), nnet_(nnet), request_(request), computation_(computation), already_called_merge_variables_(false) { analyzer_.Init(nnet, *computation); @@ -691,7 +714,8 @@ bool VariableMergingOptimizer::MergeVariables() { command_index++) { // This loop looks for pairs of sub-matrix indexes s1,s2 that we could // potentially merge into a single variable. - const NnetComputation::Command &c = computation_->commands[command_index]; + const NnetComputation::Command &c = + computation_->commands[command_index]; int32 s1 = -1, s2 = -1; if (c.command_type == kMatrixCopy && config_.remove_assignments) { @@ -723,10 +747,10 @@ bool VariableMergingOptimizer::MergeVariables() { if (s1 > 0 && s2 > 0) { std::pair p = MayBeMerged(command_index, s1, s2); if (p.first) { - DoMerge(command_index, s1, s2); + DoLeftMerge(command_index, s1, s2); merged = true; } else if (p.second) { - DoMerge(command_index, s2, s1); + DoRightMerge(command_index, s1, s2); merged = true; } } @@ -776,33 +800,45 @@ void VariableMergingOptimizer::MarkAsDirty(int32 s) { } } -void VariableMergingOptimizer::DoMerge(int32 command_index, - int32 s_to_keep, - int32 s_to_discard) { - // Prevent further optimizations touching either submatrix (we can try again - // in a later round of optimization, with a new instance of this class). - MarkAsDirty(s_to_keep); - MarkAsDirty(s_to_discard); - - int32 m_to_keep = computation_->submatrices[s_to_keep].matrix_index, - m_to_discard = computation_->submatrices[s_to_discard].matrix_index; - KALDI_ASSERT(m_to_keep != m_to_discard && m_to_keep > 0 && m_to_discard > 0); - - { // modify submatrices of m_to_discard to effectively be sub-matrices of - // s_to_keep instead (they will refer to m_to_keep as the matrix_index). 
- std::vector::const_iterator iter = - matrix_to_submatrix_[m_to_discard].begin(), - end = matrix_to_submatrix_[m_to_discard].end(); +void VariableMergingOptimizer::DoRightMerge(int32 command_index, + int32 s1, int32 s2) { + // Prevent further optimizations touching s1 or s2 (we can + // try again in a later round of optimization, with a new + // instance of this class). + MarkAsDirty(s1); + MarkAsDirty(s2); + + int32 m1 = computation_->submatrices[s1].matrix_index, + m2 = computation_->submatrices[s2].matrix_index; + KALDI_ASSERT(m1 != m2 && m1 > 0 && m2 > 0); + { // modify submatrices for submatrices of m1 to effectively be sub-matrices of + // s2 instead (they will refer to m2 as the matrix_index). + std::vector::const_iterator iter = matrix_to_submatrix_[m1].begin(), + end = matrix_to_submatrix_[m1].end(); for (; iter != end; ++iter) { int32 submatrix_index = *iter; - KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index - == m_to_discard); + KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index==m1); computation_->submatrices[submatrix_index] = - GetSubMatrixOfSubMatrix(*computation_, submatrix_index, - s_to_keep); + GetSubMatrixOfSubMatrix(*computation_, submatrix_index, s2); } } - + const std::vector &matrix_accesses = analyzer_.matrix_accesses; + // - If m1 was an input, replace it as an input with m2 + bool replaced = ReplaceInInput(nnet_, m1, m2, computation_); + KALDI_ASSERT(replaced == matrix_accesses[m1].is_input); + if (replaced) { // Remove the command that allocates m2. + int32 alloc_command = matrix_accesses[m2].allocate_command; + KALDI_ASSERT(alloc_command != -1); + computation_->commands[alloc_command].command_type = + kNoOperation; + } + // we keep matrix m2 (so m2 is m_to_keep, m1 is m_to_discard). + DoMergeCommon(command_index, m2, m1); +} + +void VariableMergingOptimizer::DoMergeCommon(int32 command_index, + int32 m_to_keep, + int32 m_to_discard) { ComputationAnalysis analysis(*computation_, analyzer_); NnetComputation::Command &c = computation_->commands[command_index]; const std::vector &matrix_accesses = @@ -816,59 +852,52 @@ void VariableMergingOptimizer::DoMerge(int32 command_index, c.arg2 = -1; } - // We want to ensure that there is only one deallocation command. - // If neither matrix is an output, then there will be 2 deallocation - // commands and we keep the one for m_to_keep (which, if the sizes - // differ, will be the larger of the two, so it's the one whose - // submatrix index refers to the entirety of the matrix). - // If one of them is an output, then remove the deallocation command - // of whichever one is not an output. - // As a simplification to the logic above: if the 'discard' matrix - // has a deallocation command (i.e. if that matrix was not an output) - // then remove it; otherwise remove the deallocation command of - // the 'keep' matrix. - + // - If both m_to_keep and m_to_discard have commands that deallocate them, + // keep only the allocation command for m_to_keep, and make sure it's after + // the last access of m_to_discard (otherwise delete any deallocation + // command). 
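As a rough illustration of the sub-matrix redirection performed in DoRightMerge above: every sub-matrix that used to live inside the discarded matrix m1 is re-expressed as a sub-matrix of the kept matrix m2, by composing its offsets with those of s2. The struct and function below are simplified stand-ins, not the actual NnetComputation types; they only sketch the offset arithmetic that GetSubMatrixOfSubMatrix() is assumed to perform here.

    typedef int int32;  // stand-in for kaldi::int32

    struct SubMatrixLoc {
      int32 matrix_index, row_offset, num_rows, col_offset, num_cols;
    };

    // Re-express 'inner' (formerly a sub-matrix of the discarded matrix)
    // relative to the kept matrix, by shifting it into 'outer' (the
    // sub-matrix s2 that the discarded matrix is merged into).
    SubMatrixLoc Compose(const SubMatrixLoc &inner, const SubMatrixLoc &outer) {
      SubMatrixLoc out;
      out.matrix_index = outer.matrix_index;  // now refers to the kept matrix.
      out.row_offset = outer.row_offset + inner.row_offset;
      out.num_rows = inner.num_rows;
      out.col_offset = outer.col_offset + inner.col_offset;
      out.num_cols = inner.num_cols;
      return out;
    }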
int32 dealloc_keep = matrix_accesses[m_to_keep].deallocate_command, dealloc_discard = matrix_accesses[m_to_discard].deallocate_command; - if (dealloc_discard != -1) { + if (dealloc_keep != -1 && dealloc_discard != -1) { + KALDI_ASSERT(analysis.LastMatrixAccess(m_to_discard) < dealloc_keep); computation_->commands[dealloc_discard].command_type = kNoOperation; } else { - KALDI_ASSERT(dealloc_keep != -1); - computation_->commands[dealloc_keep].command_type = kNoOperation; - } - - { - // - Both m_to_keep and m_to_discard will have commands that allocate - // them, as all matrices do (note, kAcceptInput counts as an allocation - // command). If one of them is kAcceptInput, then delete the other one. - // Otherwise delete the "discard" one. As a simplification of the logic - // of the previous sentence: if the "discard" allocate command is - // kAcceptInput then delete the "keep" allocate command, else delete - // the "discard" allocate command. - // Note: after we renumber the submatrices, they both refer to the - // same underlying matrix, but we need to refer to them using a - // submatrix that refers to the entire matrix. The one we keep will - // always refer to the entire matrix. (In the case where one of - // them is an input, both submatrices are guaranteed to refer to the - // entire matrix). - int32 alloc_keep = matrix_accesses[m_to_keep].allocate_command, - alloc_discard = matrix_accesses[m_to_discard].allocate_command; - - KALDI_ASSERT(alloc_keep != -1 && alloc_discard != -1); + if (dealloc_keep != -1) + computation_->commands[dealloc_keep].command_type = + kNoOperation; + if (dealloc_discard != -1) + computation_->commands[dealloc_discard].command_type = + kNoOperation; + } + + // - If both m_to_keep and m_to_discard have commands that allocate them, + // keep only the allocation command for m_to_keep and make sure it's + // before the first access of m_to_discard. + // (otherwise delete any allocation command). + int32 alloc_keep = matrix_accesses[m_to_keep].allocate_command, + alloc_discard = matrix_accesses[m_to_discard].allocate_command; + if (alloc_keep != -1 && alloc_discard != -1) { KALDI_ASSERT(analysis.FirstMatrixAccess(m_to_discard) > alloc_keep); - NnetComputation::Command &keep_alloc_command = computation_->commands[alloc_keep], &discard_alloc_command = computation_->commands[alloc_discard]; - if (discard_alloc_command.command_type == kAcceptInput) { - keep_alloc_command.command_type = kNoOperation; - } else { - discard_alloc_command.command_type = kNoOperation; + discard_alloc_command.command_type = kNoOperation; + if (keep_alloc_command.command_type == kAllocMatrixUndefined) { + keep_alloc_command.command_type = kAllocMatrixZeroed; + } else if (keep_alloc_command.command_type == kAllocMatrixFromOther) { + keep_alloc_command.command_type = kAllocMatrixFromOtherZeroed; } + } else { + if (alloc_keep != -1) + computation_->commands[alloc_keep].command_type = + kNoOperation; + if (alloc_discard != -1) + computation_->commands[alloc_discard].command_type = + kNoOperation; } // If the matrix to discard had stride_type == kStrideEqualNumCols, set the - // matrix to keep's stride_type to kStrideEqualNumCols. + // matrix to keep's stride_type to kStrideEqualNuMCols. if (computation_->matrices[m_to_discard].stride_type == kStrideEqualNumCols) { computation_->matrices[m_to_keep].stride_type = kStrideEqualNumCols; // ... and perform an additional check. 
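The allocation-command bookkeeping just above can be summarized in isolation. The enumerator names are taken from the command types referenced in this file; the helper itself is only a sketch of the rule (keep the surviving allocation and make sure it zeroes the data, which a later pass may relax back to undefined):

    enum CommandType {  // subset of the command types referenced above
      kNoOperation, kAllocMatrixZeroed, kAllocMatrixUndefined,
      kAllocMatrixFromOther, kAllocMatrixFromOtherZeroed
    };

    // When both merged matrices had allocation commands, only the kept
    // matrix's allocation survives, and it is promoted to a zeroing variant.
    CommandType SurvivingAllocType(CommandType keep_alloc) {
      switch (keep_alloc) {
        case kAllocMatrixUndefined: return kAllocMatrixZeroed;
        case kAllocMatrixFromOther: return kAllocMatrixFromOtherZeroed;
        default:                    return keep_alloc;  // already zeroing.
      }
    }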
@@ -879,6 +908,43 @@ void VariableMergingOptimizer::DoMerge(int32 command_index, } } +void VariableMergingOptimizer::DoLeftMerge(int32 command_index, + int32 s1, int32 s2) { + // Prevent further optimizations touching s1 or s2 (we can + // try again in a later round of optimization, with a new + // instance of this class). + MarkAsDirty(s1); + MarkAsDirty(s2); + + int32 m1 = computation_->submatrices[s1].matrix_index, + m2 = computation_->submatrices[s2].matrix_index; + KALDI_ASSERT(m1 != m2 && m1 > 0 && m2 > 0); + { // modify submatrices for submatrices of m2 to effectively be sub-matrices of + // s1 instead (they will refer to m1 as the matrix_index). + std::vector::const_iterator iter = matrix_to_submatrix_[m2].begin(), + end = matrix_to_submatrix_[m2].end(); + for (; iter != end; ++iter) { + int32 submatrix_index = *iter; + KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index==m2); + computation_->submatrices[submatrix_index] = + GetSubMatrixOfSubMatrix(*computation_, submatrix_index, s1); + } + } + const std::vector &matrix_accesses = analyzer_.matrix_accesses; + // - If m2 was an output, replace it as an input with m1. + bool replaced = ReplaceInOutput(nnet_, m2, m1, computation_); + KALDI_ASSERT(replaced == matrix_accesses[m2].is_output); + if (replaced) { // Remove the command that deallocates m1. + int32 dealloc_command = matrix_accesses[m1].deallocate_command; + KALDI_ASSERT(dealloc_command != -1); + computation_->commands[dealloc_command].command_type = + kNoOperation; + } + // we keep matrix m1 (so m1 is m_to_keep, m2 is m_to_discard). + DoMergeCommon(command_index, m1, m2); +} + + std::pair VariableMergingOptimizer::MayBeMerged( @@ -949,77 +1015,6 @@ std::pair VariableMergingOptimizer::MayBeMerged( return std::pair(false,false); } - -/** This class is responsible for consolidating the model-update part of - backprop commands, for components in (e.g.) recurrent networks that need to - have many separate backprop commands, into more efficient single commands - operating on consolidated data in larger matrices. This is useful for - recurrent networks. */ -class ModelUpdateConsolidator { - public: - ModelUpdateConsolidator(const Nnet &nnet, - NnetComputation *computation); - void ConsolidateModelUpdate(); - private: - void ConsolidateUpdateForComponent( - int32 component, - const std::vector &backprop_commands); - - /// This function, called at the end of ConsolidateModelUpdate(), takes the - /// commands that we have put in extra_commands_, final_commands_ and - /// final_deallocate_commands_, and puts them in the appropriate place in - /// computation->commands_. - void AddCommandsToComputation(); - - /// You call this function when you want to consolidate the values of a list - /// of submatrices taken just prior to particular commands. The input - /// 'commands' and 'submatrices' lists must be the same size, and size must be - /// > 1. This function will create a new matrix that is the row-wise - /// concatentation of all these submatrices, with values taken just prior to - /// the respective command indexes. This function will will add to - /// extra_commands_ the commands to do the copying at the appropriate places - /// (at the supplied command indexes; they will be inserted just before). The - /// return value is the submatrix index of a submatrix that represents the - /// whole of the consolidated matrix. 
This command will insert, at the - /// beginning of the computation (in extra_commands_[0]), a command to - /// initialize the matrix; and will append to final_deallocate_commands_ the - /// commands to deallocate the matrix. If computation_->matrix_debug_info is - /// nonempty, this function will also update computation_->matrix_debug_info - /// with suitable values for the newly added matrix - int32 ConsolidateSubmatrices( - const std::vector &commands, - const std::vector &submatrices); - - /// This function, called from ConsolidateSubmatrices, will - /// update 'debug_info' by appending the corresponding 'indexes' from - /// the existing debug info for this submatrix. It will also set - /// the 'is_deriv' of '*debug_info' to the same value as the - /// debug info for 'submatrix_index', and set the 'node_index' to the - /// 'node_index' in the debug info for that submatrix-index. - /// It requires that computation_->matrix_debug_info be nonempty. - void AppendDebugInfoForSubmatrix( - int32 submatrix_index, - NnetComputation::MatrixDebugInfo *debug_info) const; - - const Nnet &nnet_; - NnetComputation *computation_; - - // Indexed by the original command index in *computation_ (and sized to the - // original number of commands in *computation_ before we added anything), - // extra_commands_[c] contains a list of commands that need to be inserted - // just before command c in the previously existing computation. - std::vector > extra_commands_; - - // This is as list of kBackprop commands that will be placed after the - // commands in 'computation_->commands' and 'extra_commands_', but before - // the 'final_deallocate_commands_'. - std::vector final_commands_; - // This is a list of commands to deallocate our 'consolidated' matrices; the - // commands will be placed after the commands in 'final_commands_'. - std::vector final_deallocate_commands_; -}; - - void ModelUpdateConsolidator::AppendDebugInfoForSubmatrix( int32 submatrix_index, NnetComputation::MatrixDebugInfo *debug_info) const { @@ -1043,6 +1038,7 @@ void ModelUpdateConsolidator::AppendDebugInfoForSubmatrix( src_info.cindexes.begin() + row_end); } + // see comment by declaration in header. int32 ModelUpdateConsolidator::ConsolidateSubmatrices( const std::vector &commands, @@ -1071,14 +1067,14 @@ int32 ModelUpdateConsolidator::ConsolidateSubmatrices( int32 new_whole_submatrix = computation_->NewMatrix(num_rows, num_cols, stride_type); // Add a command at the very start, to initialize this new matrix. + int32 new_matrix_index = + computation_->submatrices[new_whole_submatrix].matrix_index; // we can later on optimize this zeroed initialization to an undefined // initialization. extra_commands_[0].push_back( - NnetComputation::Command(kAllocMatrixZeroed, new_whole_submatrix)); + NnetComputation::Command(kAllocMatrixZeroed, new_matrix_index)); final_deallocate_commands_.push_back( - NnetComputation::Command(kDeallocMatrix, new_whole_submatrix)); - int32 new_matrix_index = - computation_->submatrices[new_whole_submatrix].matrix_index; + NnetComputation::Command(kDeallocMatrix, new_matrix_index)); if (!computation_->matrix_debug_info.empty()) computation_->matrix_debug_info[new_matrix_index].Swap(&debug_info); @@ -1095,7 +1091,7 @@ int32 ModelUpdateConsolidator::ConsolidateSubmatrices( // submatrix numbered 'new_submatrix' the contents of the submatrix numbered // 'submatrices[i]'. 
Note: we hope that a later pass of optimization // (VariableMergingOptimization) will remove this redundant copy by - // having the operation that created it write directly to the location + // having the operation that created it right directly to the location // we want it to be. NnetComputation::Command c(kMatrixCopy, new_submatrix, submatrices[i]); extra_commands_[commands[i]].push_back(c); @@ -1216,19 +1212,6 @@ void ModelUpdateConsolidator::ConsolidateModelUpdate() { AddCommandsToComputation(); } - -void ConsolidateModelUpdate(const Nnet &nnet, - NnetComputation *computation) { - // This following if-statement is an optimization: if the computation - // request(s) had need_model_derivative == false, there would be nothing to - // optimize, so don't bother trying. - if (!computation->need_model_derivative) - return; - ModelUpdateConsolidator consolidator(nnet, computation); - consolidator.ConsolidateModelUpdate(); -} - - // inline void DerivativeTimeLimiter::GetPruneValues(int32 initial_submatrix, int32 new_submatrix, @@ -1312,8 +1295,8 @@ void DerivativeTimeLimiter::ModifyCommand(NnetComputation::Command *command) { command->arg5 = mapped_output_deriv_submatrix; command->arg6 = mapped_input_deriv_submatrix; } - break; } + break; case kMatrixCopy: case kMatrixAdd: MapSimpleMatrixCommand(command); break; @@ -1328,7 +1311,6 @@ void DerivativeTimeLimiter::ModifyCommand(NnetComputation::Command *command) { MapAddRowRangesCommand(command); break; } - case kAcceptInput: case kProvideOutput: case kNoOperation: case kNoOperationMarker: break; default: @@ -1351,7 +1333,7 @@ void DerivativeTimeLimiter::MapSimpleMatrixCommand(NnetComputation::Command *c) c->command_type = kNoOperation; return; } - // left_prune1 is the number of rows pruned away on the left for submatrix1. + // left_prune1 is the nmber of rows pruned away on the left for submatrix1. int32 orig_num_rows = computation_->submatrices[submatrix1].num_rows, left_prune1, left_prune2, right_prune1, right_prune2; GetPruneValues(submatrix1, submatrix1_mapped, &left_prune1, &right_prune1); @@ -1373,7 +1355,7 @@ void DerivativeTimeLimiter::MapSimpleMatrixCommand(NnetComputation::Command *c) } else { int32 num_rows = orig_num_rows - left_prune - right_prune; // note: the call NewSubMatrix effectively gives us a sub-matrix of a - // sub-matrix. + // subm-matrix. c->arg1 = computation_->NewSubMatrix(submatrix1, left_prune, num_rows, 0, -1); c->arg2 = computation_->NewSubMatrix(submatrix2, @@ -1583,7 +1565,7 @@ void DerivativeTimeLimiter::LimitDerivTimes() { max_deriv_time_ == std::numeric_limits::max()) return; // nothing to do. 
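A worked example, with invented numbers, of the row-pruning arithmetic in MapSimpleMatrixCommand() above:

    typedef int int32;  // stand-in for kaldi::int32

    // Rows surviving after 'left_prune' rows are cut from the start and
    // 'right_prune' rows from the end of an 'orig_num_rows'-row sub-matrix;
    // a non-positive result corresponds to the case above where the whole
    // command is turned into a kNoOperation.
    int32 SurvivingRows(int32 orig_num_rows, int32 left_prune,
                        int32 right_prune) {
      return orig_num_rows - left_prune - right_prune;
    }
    // e.g. SurvivingRows(20, 3, 3) == 14: a copy over 20-row operands becomes
    // a copy over rows [3, 17) of each operand.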
- computation_->GetWholeSubmatrices(&whole_submatrices_); + EnsureMatricesHaveEntireSubmatrices(); ComputeMatrixPruneInfo(); ComputeSubmatrixMaps(); ModifyCommands(); @@ -1592,6 +1574,20 @@ void DerivativeTimeLimiter::LimitDerivTimes() { RenumberComputation(computation_); } +void DerivativeTimeLimiter::EnsureMatricesHaveEntireSubmatrices() { + int32 num_matrices = computation_->matrices.size(), + num_submatrices = computation_->submatrices.size(); + entire_submatrix_.clear(); + entire_submatrix_.resize(num_matrices, -1); + entire_submatrix_[0] = 0; + for (int32 s = 1; s < num_submatrices; s++) + if (computation_->IsWholeMatrix(s)) + entire_submatrix_[computation_->submatrices[s].matrix_index] = s; + for (int32 m = 1; m < num_matrices; m++) + if (entire_submatrix_[m] == -1) + entire_submatrix_[m] = computation_->NewSubMatrix(m, 0, -1, 0, -1); +} + void DerivativeTimeLimiter::ComputeMatrixPruneInfo() { KALDI_ASSERT(computation_->matrix_debug_info.size() == computation_->matrices.size() && @@ -1692,20 +1688,20 @@ void DerivativeTimeLimiter::ModifyCommands() { // desired range are never accessed), and false otherwise. bool DerivativeTimeLimiter::CanLimitMatrix(const Analyzer &analyzer, int32 m) const { - int32 s_whole = whole_submatrices_[m]; // submatrix consisting of + int32 s_entire = entire_submatrix_[m]; // submatrix consisting of // all of the matrix. - int32 s_mapped = submatrix_map_[s_whole]; // the matrix limited in time. - KALDI_ASSERT(s_mapped != 0 && s_mapped != s_whole); - std::vector whole_variables, mapped_variables; - analyzer.variables.AppendVariablesForSubmatrix(s_whole, - &whole_variables); + int32 s_mapped = submatrix_map_[s_entire]; // the matrix limited in time. + KALDI_ASSERT(s_mapped != 0 && s_mapped != s_entire); + std::vector entire_variables, mapped_variables; + analyzer.variables.AppendVariablesForSubmatrix(s_entire, + &entire_variables); analyzer.variables.AppendVariablesForSubmatrix(s_mapped, &mapped_variables); - KALDI_ASSERT(whole_variables.size() > mapped_variables.size()); - std::vector excluded_variables(whole_variables.size() - + KALDI_ASSERT(entire_variables.size() > mapped_variables.size()); + std::vector excluded_variables(entire_variables.size() - mapped_variables.size()); std::vector::iterator end_iter = - std::set_difference(whole_variables.begin(), whole_variables.end(), + std::set_difference(entire_variables.begin(), entire_variables.end(), mapped_variables.begin(), mapped_variables.end(), excluded_variables.begin()); KALDI_ASSERT(end_iter == excluded_variables.end()); @@ -1754,24 +1750,15 @@ void DerivativeTimeLimiter::LimitMatrices(const std::vector &will_limit) { // rows to the left. submat_info.row_offset = new_row_begin; } else { - // This submatrix is not entirely inside the kept range of the matrix. - // We assume that this submatrix is never accessed directly except (if - // it was the whole matrix) for in allocation and deallocation commands, - // since when we modified the computation we ensured this. - if (computation_->IsWholeMatrix(s)) { - // If it was the whole matrix then it may be used in allocation and - // deallocation commands, so we should modify it to be the whole of the - // new matrix, which will have fewer rows than before. - submat_info.num_rows = matrix_num_rows; - } else { - // We believe this matrix should never be used. We give it a valid - // but stupid size of num-rows=1, num-cols=1, so that if it ever does - // get accessed it should produce an error. 
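CanLimitMatrix() above uses std::set_difference to find the variables that belong to the whole matrix but not to its time-limited part; below is a self-contained sketch of that pattern, with plain int ids standing in for the analyzer's variable indexes:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Both inputs must be sorted and 'mapped' must be a subset of 'entire';
    // the result is the ids present only in 'entire', i.e. (in the optimizer)
    // the variables lying outside the kept time range.
    std::vector<int> ExcludedVariables(const std::vector<int> &entire,
                                       const std::vector<int> &mapped) {
      std::vector<int> excluded(entire.size() - mapped.size());
      std::vector<int>::iterator end_iter =
          std::set_difference(entire.begin(), entire.end(),
                              mapped.begin(), mapped.end(), excluded.begin());
      assert(end_iter == excluded.end());  // mirrors the KALDI_ASSERT above.
      return excluded;
    }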
- submat_info.row_offset = 0; - submat_info.num_rows = 1; - submat_info.col_offset = 0; - submat_info.num_cols = 1; - } + // This submatrix is not entirely the kept range of the matrix. + // We assume that this submatrix is never accessed directly (as when + // we modified the computation we ensured this). We + // give it a valid but stupid size of num-rows=1, num-cols=1, so + // that if it ever does get accessed it should produce an error. + submat_info.row_offset = 0; + submat_info.num_rows = 1; + submat_info.col_offset = 0; + submat_info.num_cols = 1; } } } @@ -1798,7 +1785,7 @@ void DerivativeTimeLimiter::LimitMatrices(const std::vector &will_limit) { void DerivativeTimeLimiter::PruneMatrices() { Analyzer analyzer; analyzer.Init(nnet_, *computation_); - KALDI_ASSERT(computation_->matrices.size() == whole_submatrices_.size()); + KALDI_ASSERT(computation_->matrices.size() == entire_submatrix_.size()); int32 num_matrices = computation_->matrices.size(); std::vector will_limit(num_matrices, false); bool will_limit_at_least_one = false; @@ -1843,6 +1830,22 @@ void DerivativeTimeLimiter::PruneMatrices() { } +int32 MaxOutputTimeInRequest(const ComputationRequest &request) { + int32 ans = std::numeric_limits::min(); + for (size_t i = 0; i < request.outputs.size(); i++) { + std::vector indexes &indexes = request.outputs[i].indexes; + std::vector indexes::const_iterator iter = indexes.begin(), + end = indexes.end(); + for (; iter != end; ++iter) + if (iter.t > ans) + ans = iter.t; + } + if (ans == std::numeric_limits::min()) { + KALDI_ERR << "Failed to find any output indexes in computation request."; + } + return ans; +} + void LimitDerivativeTimes(const Nnet &nnet, int32 min_deriv_time, int32 max_deriv_time, @@ -1852,863 +1855,5 @@ void LimitDerivativeTimes(const Nnet &nnet, limiter.LimitDerivTimes(); } - -/* - This helper function, used in ReplaceRowWithMatrixOps, detects - when the vector 'indexes' has a 'special structure'. The special structure - is: - zero or more -1's, then - a consecutive nonempty sequence of nonnegative numbers, e.g. 6 7 8 9 10, then - zero or more -1's. - - Note: this function assumes that any negative elements of 'indexes' are -1. - If there are elements less than -1, then it is an error, but this function - does not thoroughly check for that. 'indexes' is required to be a nonempty - vector. - - If 'indexes' has the special structure then this function returns true - and sets the following values, which will explain with the following - example in mind: 'indexes = [ -1, -1, 5 6 7 8, -1 ]'. - - '*first_nonnegative_pos' is set to the number of initial -1's (and also - the location of the first nonnegative element): 2 in this case. - - '*first_nonnegative_value' is set to the value of the first nonnegative - element (5 in this case) - - '*num_nonnegative_values' is set to the number of nonnegative values in - the sequence (4 in this case). - If 'indexes' does not have this special structure, then this function returns - false, and the values of '*first_nonnegative_pos', - '*first_nonnegative_value' and '*num_nonnegative_indexes' on exit are - undefined. -*/ -static bool IndexesHaveSpecialStructure(const std::vector &indexes, - int32 *first_nonnegative_pos, - int32 *first_nonnegative_value, - int32 *num_nonnegative_indexes) { - KALDI_ASSERT(!indexes.empty()); - const int32 *indexes_ptr = &(indexes[0]); - size_t pos = 0, size = indexes.size(); - - // Find the first nonnegative element of 'indexes'. 
- for (; pos < size; ++pos) - if (indexes_ptr[pos] >= 0) - break; - if (pos == size) - return false; // all -1's... should not happen, but not our problem. - *first_nonnegative_pos = static_cast(pos); - int32 n = indexes_ptr[pos]; - *first_nonnegative_value = n; - // Find the first element after '*first_nonnegative_index' that isn't - // consecutive. - for (; pos < size; ++pos,++n) - if (indexes_ptr[pos] != n) - break; - - *num_nonnegative_indexes = n - *first_nonnegative_value; - - // Check that the remaining values are all <0 (assumed equal to -1, but - // checking <0 may be faster as just one instruction). - for (; pos < size; ++pos) - if (indexes_ptr[pos] >= 0) - return false; // does not have the special structure. - - return true; -} - - - -bool ReplaceRowWithMatrixOps(NnetComputation *computation) { - bool ans = false; - int32 num_commands = computation->commands.size(), - num_indexes = computation->indexes.size(); - for (int32 command_index = 0; command_index < num_commands; - command_index++) { - // non-const because we'll be changing it. - NnetComputation::Command &c = computation->commands[command_index]; - - int32 first_nonnegative_pos, - first_nonnegative_value, - num_nonnegative_indexes; - switch (c.command_type) { - case kCopyRows: case kAddRows: { - int32 indexes_index = c.arg3; - KALDI_ASSERT(indexes_index < num_indexes); - const std::vector &indexes = computation->indexes[indexes_index]; - if (IndexesHaveSpecialStructure(indexes, - &first_nonnegative_pos, - &first_nonnegative_value, - &num_nonnegative_indexes)) { - ans = true; - c.arg1 = computation->NewSubMatrix(c.arg1, first_nonnegative_pos, - num_nonnegative_indexes, - 0, -1); - c.arg2 = computation->NewSubMatrix(c.arg2, first_nonnegative_value, - num_nonnegative_indexes, - 0, -1); - c.command_type = (c.command_type == kCopyRows ? kMatrixCopy : - kMatrixAdd); - } - break; - } - default: - continue; - } - } - return ans; -} - -// This class implements the internals of the ExpandComputation() function (used -// in shortcut compilation); see comment by the declaration of -// ExpandComputation() in nnet-optimize-utils.h for overview. -class ComputationExpander { - public: - ComputationExpander(const NnetComputation &computation, - bool need_debug_info, - int32 num_n_values, - NnetComputation *expanded_computation): - computation_(computation), - need_debug_info_(need_debug_info), - num_n_values_(num_n_values), - expanded_computation_(expanded_computation) { - KALDI_ASSERT(num_n_values > 2); - } - - // This function call implements the functionality of the class, - // expanding the computation. - bool Expand(); - - private: - // This function sets up and computes the 'n_fast' vector (see comment - // by it for what this is. - void InitFastInfo(); - - // This function sets up the 'matrices' vector in 'expanded_computation_'. - // It's quite simple: it just multiplies all the num-rows by num_n_values_ and - // divides by 2, and leaves the num-cols the same. - void ComputeMatrices(); - - // This function, only called if need_debug_info_ is true, sets up - // the 'matrix_debug_info' vector in 'expanded_computation_'. - void ComputeDebugInfo(); - - // This function sets up the 'submatrices' vector in 'expanded_computation_'. - // Column ranges always stay the same, but for row ranges it's a little - // more complicated. - void ComputeSubmatrixInfo(); - - - // This function computes all the PrecomputedIndexes in the - // 'component_precomputed_indexes' member of 'expanded_computation_'. 
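To make the 'special structure' concrete, here is a tiny stand-alone check that re-derives the three outputs for the example vector used in the comment above ({-1, -1, 5, 6, 7, 8, -1}); it mirrors, but is not, the static helper in this file:

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> indexes = { -1, -1, 5, 6, 7, 8, -1 };
      int size = indexes.size(), pos = 0;
      while (pos < size && indexes[pos] < 0) ++pos;             // leading -1's
      int first_pos = pos, first_value = indexes[pos], n = first_value;
      while (pos < size && indexes[pos] == n) { ++pos; ++n; }   // consecutive run
      int num_values = n - first_value;
      while (pos < size && indexes[pos] < 0) ++pos;             // trailing -1's
      bool special = (pos == size);
      // A kCopyRows over these 7 rows can become a kMatrixCopy over 4-row
      // sub-matrices: rows [2, 6) of the destination, rows [5, 9) of the source.
      assert(special && first_pos == 2 && first_value == 5 && num_values == 4);
      return 0;
    }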
- // They are all generated from scratch, by using the Component::PrecomputedIndexes() - // member function. The 'input_indexes' and 'output_indexes' arguments are worked - // out from the 'debug_info' [if we're not generating debug_info we specially generate - // it for the specific matrices in question], and the 'need_backprop' - // argument is worked out by seeing whether there is a call to Backprop() with - // the same precomputed-indexes element. - void ComputePrecomputedIndexes(); - - // Computes the 'commands' member of the output. This function also adds as - // needed to 'indexes', 'indexes_multi' and 'indexes_ranges' in the output. - // Later on we can call RenumberComputation() to remove any duplicates that - // might result from this. - void ComputeCommands(); - - - // This 'n_fast' vector is indexed by the matrix-index in the computation, - // i.e. the same index as indexes computation_.matrix_info and - // expanded_computation_->matrix_info. For each matrix-index m > 0 it - // contains true if the 'n' varies 'fast', or false if the 'n' index varies - // 'slowly'. By 'fast' and 'slow', we mean in the same sense as is desribed - // in the comment for ComputationIsDecomposable() in nnet-optimize-utils.h. - std::vector n_fast; - - - - - - - const NnetComputation &computation_; - bool need_debug_info_; - int32 num_n_values_; - NnetComputation *expanded_computation_; -}; - - - -class ComputationLoopedOptimizer { - public: - ComputationLoopedOptimizer(const Nnet &nnet, - NnetComputation *computation): - nnet_(nnet), computation_(computation) { } - bool Optimize(); - - private: - - // Figures out the time shift between the successive computation requests. - static int32 FindTimeShift(const NnetComputation &computation, - const std::vector &segment_ends); - - // This function creates a mapping from a matrix-index > 0, - // to a pair (unique_id, time_offset) that represents the debug-info - // for that matrix-id in computation.debug_info. - // The output vector is indexed by the matrix-index in the computation (the - // zeroth member is not valid). It requires that the - // The 'time_offset' is equal to the 't' value of the zeroth element of the - // cindexes vetor. The 'unique_id' is an integer that uniquely identifies - // what we get from subtracting the 'time_offset' from each 't' value of - // that 'cindexes' vector, and then pairing it up with the 'is_deriv' - // value of the DebugInfo. That is, if two 'cindexes' vectors differ only - // by a time offset, and the 'is_deriv' values are the same they will map to the same - // unique_id. - // The output 'matrix_to_pair' is indexed by matrix index (the zeroth element is - // not set). - static void CreateMatrixPairs(const NnetComputation &computation, - std::vector > *matrix_to_pair); - - - // This very simple helper function reverses the map 'matrix_to_pair' so we can - // do the reverse lookup. It outputs a map from pair to matrix index m, where - // 1 <= m < matrix_to_pair.size(). - static void GetPairToMatrixMap( - std::vector > &matrix_to_pair, - unordered_map, int32, PairHasher > *pair_to_matrix); - - - // Given a vector of lists, one list for each segment, of the active matrices - // at the end of that segment, this function converts those lists into a - // different representation where each matrix is reprented as a pair instead - // of as a single int32. 'active_pairs' will have the same dimensions as - // 'active_matrices'. 
- static void ConvertListsToPairLists( - const std::vector > &active_matrices, - const std::vector > &matrix_to_pair, - std::vector > > *active_pairs); - - // This function modifies the lists of active matrices per segment - // (represented as pairs) in 'active_pairs' by sorting them and - // then subtracting the time-offset of the first pair in each - // list ((*active_pair)[seg][0].second), from all elements in that list. - // It puts the subtracted offset in (*time_offsets)[seg]. This change - // of representation makes it easy to tell whether the sets of active - // matrices for different segments are identical up to a time-offset. - static void NormalizePairLists( - std::vector > > *active_pairs, - std::vector *time_offsets); - - // This function looks in the matrix 'active_pairs' for the first pair of - // identical values, i.e. it is looking for i < j for which - // normalized_active_pairs[i] == normalized_active_pairs[j]. (However, the - // pair i,j must satisfy an extra condition, see below). If a pair - // i,j exists satisfying these conditions, this function outputs them to *seg1 - // and *seg2, and returns true; otherwise it returns false. - // - // Extra condition: - // It turns out that under some circumstances, we can - // fine repeats that were not "really" repeats (the matrices were not time - // shifted) The situation was a bit obscure (it was a non-recurrent setup with - // a lot of extra-right-context, where some inputs were never used), but to - // prevent it happening again we are now checking in addition to the above, - // that the time-shift between the segments (i.e. time_offsets[j] - - // time_offsets[i]), has the "expected value" based on the assumption that - // each segment should be shifted relative to the previous segment, by - // 'time_shift_per_segment'. - static bool FindFirstRepeat( - const std::vector > > &normalized_active_pairs, - const std::vector &time_offsets, - int32 time_shift_per_segment, - int32 *seg1, int32 *seg2); - - // Converts a list of pairs (e.g. one of the elements of the output of - // 'ConvertListsToPairLists)', back into a list of matrix indexes, using the - // map 'pair_to_matrix'. - static void PairListToMatrixList( - const std::vector > &pair_list, - const unordered_map, int32, PairHasher > &pair_to_matrix, - std::vector *matrix_list); - - - // This function just does some checking (via asserts), that - // the lists of matrices 'list1' and 'list2' are of the same length, - // that time_difference > 0, that each matrix with index m = list2[i] is of the - // same dimension as the list1[i], with Cindexes that are the same except for - // the time index being greater by 'time_difference' - static void CheckIdentifiedMatrices( - const NnetComputation &computation, - const std::vector &list1, - const std::vector &list2, - int32 time_difference); - - - // Given two command indexes command1 < command2 pointing to commands of type - // kNoOperationMarker, this function modifies the computation by - // removing all commands after command2, replacing command2 with a kGotoLabel - // command pointing to command1 and then inserting just before command1 - // a marker of type kNoOperationLabel. - static void FormInfiniteLoop(int32 command1, int32 command2, - NnetComputation *computation); - - // This is to be called after FormInfiniteLoop. 
It inserts, just before - // the final kGotoLabel command, commands that initialize - // each of the matrices in list 'matrices1' from the corresponding - // matrix in 'matrices2', using the kAllocMatrixFromOther command. - // This effectively does, for example, matrices1[i] = matrices2[i], - // while initializing matrices1[i] and deallocating matrices2[i]; - // it's implemented as a shallow swap. - // It does this in such an order that even if the two lists are - // not disjoint, the right thing happens. - static void AddMatrixSwapCommands( - const std::vector &matrices1, - const std::vector &matrices2, - NnetComputation *computation); - - - // Called from AddMatrixSwapCommands, this function figures out for us - // an acceptable order in which to execute the kAllocMatrixFromOther - // commands. This is easy to do if matrices1 and matrices2 are disjoint - // sets, but has to be done more carefully if they overlap. - // The output is a list of pairs where each pair (a, b) comes from - // from matrices1 and matrices2 in the same position, i.e. - // a = matrices1[i] and b = matrices2[i]. - static void GetMatrixSwapOrder( - const std::vector &matrices1, - const std::vector &matrices2, - std::vector > *swaps); - - - - /// Given a list of command indexes ('segment_end_commands') which are - /// expected to be command indexes of the kNoOperationMarker at segment - /// boundaries, this function outputs for each of these command indexes a list - /// of matrices which are 'active' at that point in time. By 'active' we mean - /// that the matrix has been written to before that time (note, we don't count - /// initialization with zeros as being written to); and will be read after - /// that time. These is the list of matrices that 'need to be in scope' - /// at those points in time. '*active_matrices' is indexed by the - /// same index as 'segment_end_commands', and is then a list of active - /// matrices, in numerical order of matrix index. - /// Note: for each i, (*active_matrices)[i] will be sorted and unique. 
- static void FindActiveMatrices(const NnetComputation &computation, - const Analyzer &analyzer, - const std::vector &segment_end_commands, - std::vector > *active_matrices); - - - const Nnet &nnet_; - NnetComputation *computation_; - Analyzer analyzer_; - std::vector > matrix_to_pair_; - - std::vector segment_end_commands_; -}; - -// static -int32 ComputationLoopedOptimizer::FindTimeShift( - const NnetComputation &computation, - const std::vector &segment_ends) { - KALDI_ASSERT(segment_ends.size() >= 3); - // Ignore the first segment as it tends to be a special case - // (it has more left context), - int32 second_segment_begin = segment_ends[0], - third_segment_begin = segment_ends[1], - fourth_segment_begin = segment_ends[2]; - int32 first_output_command_seg2 = -1, - first_output_command_seg3 = -1; - for (int32 c = second_segment_begin; c < third_segment_begin; c++) - if (computation.commands[c].command_type == kProvideOutput && - first_output_command_seg2 < 0) - first_output_command_seg2 = c; - for (int32 c = third_segment_begin; c < fourth_segment_begin; c++) - if (computation.commands[c].command_type == kProvideOutput && - first_output_command_seg3 < 0) - first_output_command_seg3 = c; - if (first_output_command_seg2 < 0 || - first_output_command_seg3 < 0) - KALDI_ERR << "Could not locate output commands for segments 2 and 3."; - const NnetComputation::Command - &command2 = computation.commands[first_output_command_seg2], - &command3 = computation.commands[first_output_command_seg3]; - int32 seg2_node = command2.arg2, seg3_node = command3.arg2; - KALDI_ASSERT(seg2_node == seg3_node); - int32 seg2_submatrix = command2.arg1, - seg3_submatrix = command3.arg1; - KALDI_ASSERT(computation.IsWholeMatrix(seg2_submatrix) && - computation.IsWholeMatrix(seg3_submatrix)); - int32 seg2_matrix = computation.submatrices[seg2_submatrix].matrix_index, - seg3_matrix = computation.submatrices[seg3_submatrix].matrix_index; - KALDI_ASSERT(computation.matrices[seg2_matrix].num_rows == - computation.matrices[seg3_matrix].num_rows); - KALDI_ASSERT(!computation.matrix_debug_info.empty()); - const NnetComputation::MatrixDebugInfo - &debug_info2 = computation.matrix_debug_info[seg2_matrix], - &debug_info3 = computation.matrix_debug_info[seg3_matrix]; - int32 t_offset = debug_info3.cindexes[0].second.t - - debug_info2.cindexes[0].second.t; - int32 num_rows = debug_info2.cindexes.size(); - for (int32 r = 0; r < num_rows; r++) { - KALDI_ASSERT(debug_info3.cindexes[r].second.t == - debug_info2.cindexes[r].second.t + t_offset); - } - return t_offset; -} - -// static -void ComputationLoopedOptimizer::CreateMatrixPairs( - const NnetComputation &computation, - std::vector > *matrix_to_pair) { - typedef unordered_map, int32, - CindexVectorHasher> MapType; - int32 cur_vector_id = 1; - // Note: cindex_map just maps the vector to a unique value, - // and then we manually work out a unique id that takes into - // account the 'is_deriv' values. 
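Illustrative values only (the matrix numbers and times below are invented): the point of the (unique_id, time_offset) pairs built by CreateMatrixPairs() is that matrices whose cindexes differ only by a time shift end up with the same unique_id. For instance, cindexes with t = {10, 11, 12} and t = {25, 26, 27} both normalize to {0, 1, 2}, so (given equal is_deriv) they share a unique_id and differ only in their stored offsets, 10 and 25. A minimal sketch of the normalization step:

    #include <vector>

    // Subtract the first time index from every entry, as CreateMatrixPairs()
    // does to the 't' fields of each matrix's cindexes before hashing them.
    std::vector<int> NormalizeTimes(std::vector<int> times) {
      int t_offset = times.empty() ? 0 : times[0];
      for (size_t i = 0; i < times.size(); i++)
        times[i] -= t_offset;
      return times;  // {10, 11, 12} -> {0, 1, 2}; {25, 26, 27} -> {0, 1, 2}.
    }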
- MapType cindex_map; - int32 num_matrices = computation.matrices.size(); - matrix_to_pair->resize(num_matrices); - KALDI_ASSERT(computation.matrix_debug_info.size() == num_matrices); - for (int32 m = 1; m < num_matrices; m++) { - KALDI_ASSERT(!computation.matrix_debug_info[m].cindexes.empty()); - std::vector cindexes = computation.matrix_debug_info[m].cindexes; - int32 t_offset = cindexes[0].second.t; - for (std::vector::iterator iter = cindexes.begin(); - iter != cindexes.end(); ++iter) - iter->second.t -= t_offset; - MapType::const_iterator iter = cindex_map.find(cindexes); - int32 vector_id; - if (iter != cindex_map.end()) { - vector_id = iter->second; - } else { - vector_id = cur_vector_id++; - cindex_map[cindexes] = vector_id; - } - bool is_deriv = computation.matrix_debug_info[m].is_deriv; - int32 unique_id = 2 * vector_id + (is_deriv ? 1 : 0); - (*matrix_to_pair)[m].first = unique_id; - (*matrix_to_pair)[m].second = t_offset; - } -} - -// static -void ComputationLoopedOptimizer::GetPairToMatrixMap( - std::vector > &matrix_to_pair, - unordered_map, int32, PairHasher > *pair_to_matrix) { - int32 num_matrices = matrix_to_pair.size(); - // actually there are one fewer matrices than num_matrices. - pair_to_matrix->clear(); - for (int32 m = 1; m < num_matrices; m++) - (*pair_to_matrix)[matrix_to_pair[m]] = m; -} - - -// static -void ComputationLoopedOptimizer::ConvertListsToPairLists( - const std::vector > &active_matrices, - const std::vector > &matrix_to_pair, - std::vector > > *active_pairs) { - active_pairs->clear(); - active_pairs->resize(active_matrices.size()); - int32 num_matrices = matrix_to_pair.size(); - for (size_t seg = 0; seg < active_matrices.size(); seg++) { - const std::vector &this_active_matrix_list = active_matrices[seg]; - std::vector > &this_active_pair_list = - (*active_pairs)[seg]; - this_active_pair_list.resize(this_active_matrix_list.size()); - std::vector::const_iterator iter = this_active_matrix_list.begin(), - end = this_active_matrix_list.end(); - std::vector >::iterator - out_iter = this_active_pair_list.begin(); - for (; iter != end; ++iter, ++out_iter) { - KALDI_ASSERT(*iter > 0 && *iter < num_matrices); - *out_iter = matrix_to_pair[*iter]; - } - } -} - -// static -void ComputationLoopedOptimizer::NormalizePairLists( - std::vector > > *active_pairs, - std::vector *time_offsets) { - int32 num_segments = active_pairs->size(); - time_offsets->resize(num_segments); - for (int32 seg = 0; seg < num_segments; seg++) { - std::vector > &this_pairs = (*active_pairs)[seg]; - std::sort(this_pairs.begin(), this_pairs.end()); - int32 this_offset; - if (!this_pairs.empty()) { - this_offset = this_pairs[0].second; - } else { - // if this_pairs is empty, produce arbitrary offsets that are increasing - // (this will keep some self-testing code happy). - if (seg == 0) { this_offset = 0; } - else { this_offset = (*time_offsets)[seg - 1] + 1; } - } - (*time_offsets)[seg] = this_offset; - std::vector >::iterator - iter = this_pairs.begin(), end = this_pairs.end(); - for (; iter != end; ++iter) - iter->second -= this_offset; - } -} - - -// static -bool ComputationLoopedOptimizer::FindFirstRepeat( - const std::vector > > &normalized_active_pairs, - const std::vector &time_offsets, - int32 time_shift_per_segment, - int32 *seg1, int32 *seg2) { - int32 num_segments = normalized_active_pairs.size(); - // This algorithm may seem like it would be very slow, but the number of - // segments will normally be quite small (e.g. 
10), and the comparison of - // elements of 'normalized_active_pairs' should be fast in cases where they - // differ. - KALDI_ASSERT(num_segments >= 2); - - bool perform_time_offset_check = true; - if (normalized_active_pairs.back().empty()) { - // If there are no variables active after the end of the last-but-one segment - // (which is the last element in segment_ends, since we remove the end of the - // very last segment), then don't perform the check related to - // time-offsets, it's not relevant. [this would probably be a computation - // that doesn't require any context]. - perform_time_offset_check = false; - } - for (int32 s = 0; s < num_segments; s++) { - for (int32 t = s + 1; t < num_segments; t++) { - if ((!perform_time_offset_check || - time_offsets[t]-time_offsets[s] == (t-s) * time_shift_per_segment) && - normalized_active_pairs[s] == normalized_active_pairs[t]) { - *seg1 = s; - *seg2 = t; - return true; - } - } - } - return false; -} - -// static -void ComputationLoopedOptimizer::PairListToMatrixList( - const std::vector > &pair_list, - const unordered_map, int32, PairHasher > &pair_to_matrix, - std::vector *matrix_list) { - matrix_list->resize(pair_list.size()); - std::vector >::const_iterator - iter = pair_list.begin(), end = pair_list.end(); - std::vector::iterator out_iter = matrix_list->begin(); - for (; iter != end; ++iter, ++out_iter) { - unordered_map, int32, - PairHasher >::const_iterator - map_iter = pair_to_matrix.find(*iter); - if (map_iter == pair_to_matrix.end()) { - KALDI_ERR << "Could not find pair in map (code error)"; - } - *out_iter = map_iter->second; - } -} - - - -// static -void ComputationLoopedOptimizer::FindActiveMatrices( - const NnetComputation &computation, - const Analyzer &analyzer, - const std::vector &segment_end_commands, - std::vector > *active_matrices) { - int32 num_matrices = computation.matrices.size(); - int32 num_segments = segment_end_commands.size(); - active_matrices->clear(); - active_matrices->resize(num_segments); - // this object just makes available some extra functions, vs. the Analyzer - // object. - ComputationAnalysis analysis(computation, analyzer); - KALDI_ASSERT(IsSortedAndUniq(segment_end_commands)); - - // the following vector gives us, for each matrix index, a submatrix index - // that covers the whole of that matrix (needed by interface of 'analysis' object). - std::vector whole_submatrices; - computation.GetWholeSubmatrices(&whole_submatrices); - for (int32 m = 1; m < num_matrices; m++) { - // the following are command indexes, comparable with the indexes - // in 'segment_end_commands'. - int32 s = whole_submatrices[m], // submatrix consisting of the whole of - // 'm'. - first_access = analysis.FirstAccess(s), - last_access = analysis.LastAccess(s); - for (int32 seg = 0; seg < num_segments; seg++) { - int32 segment_end = segment_end_commands[seg]; - if (first_access < segment_end && last_access > segment_end) { - // If the block of time during which the matrix is accessed, includes - // this segment end-point, then the matrix is considered 'active' at - // that time. 
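A minimal restatement of the 'active at a segment end' test applied in the loop above (the command indexes are invented; first_access and last_access stand in for the ComputationAnalysis calls):

    typedef int int32;  // stand-in for kaldi::int32

    // A matrix is active at a segment boundary if it is written before the
    // boundary and still read after it, i.e. the boundary falls strictly
    // inside its [first_access, last_access] window.
    bool ActiveAtSegmentEnd(int32 first_access, int32 last_access,
                            int32 segment_end) {
      return first_access < segment_end && last_access > segment_end;
    }
    // e.g. a matrix first accessed at command 40 and last accessed at command
    // 95 is active at a segment end of 70, but not at 30 or 100.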
- (*active_matrices)[seg].push_back(m); - } - } - } -} - -// static -void ComputationLoopedOptimizer::CheckIdentifiedMatrices( - const NnetComputation &computation, - const std::vector &list1, - const std::vector &list2, - int32 time_difference) { - KALDI_ASSERT(time_difference > 0); - KALDI_ASSERT(list1.size() == list2.size()); - KALDI_ASSERT(!computation.matrix_debug_info.empty()); - for (size_t i = 0; i < list1.size(); i++) { - int32 m1 = list1[i], m2 = list2[i]; - const NnetComputation::MatrixInfo - &matrix_info1 = computation.matrices[m1], - &matrix_info2 = computation.matrices[m2]; - KALDI_ASSERT(matrix_info1.num_rows == matrix_info2.num_rows && - matrix_info1.num_cols == matrix_info2.num_cols && - matrix_info1.stride_type == matrix_info2.stride_type); - const NnetComputation::MatrixDebugInfo - &debug_info1 = computation.matrix_debug_info[m1], - &debug_info2 = computation.matrix_debug_info[m2]; - KALDI_ASSERT(debug_info1.is_deriv == debug_info2.is_deriv); - KALDI_ASSERT(debug_info1.cindexes.size() == debug_info2.cindexes.size()); - std::vector::const_iterator iter1 = debug_info1.cindexes.begin(), - end1 = debug_info1.cindexes.end(), - iter2 = debug_info2.cindexes.begin(); - for (; iter1 != end1; iter1++,iter2++) { - KALDI_ASSERT(iter2->first == iter1->first && - iter2->second.n == iter1->second.n && - iter2->second.t == iter1->second.t + time_difference && - iter2->second.x == iter1->second.x); - } - } -} - - -// static -void ComputationLoopedOptimizer::GetMatrixSwapOrder( - const std::vector &matrices1, - const std::vector &matrices2, - std::vector > *swaps) { - KALDI_ASSERT(matrices1.size() == matrices2.size()); - swaps->clear(); - int32 num_matrices = matrices1.size(); - std::vector processed(num_matrices, false); - std::vector queue; - - // num_loops is just for infinite-loop detection. - int32 num_loops = 0; - for (; static_cast(swaps->size()) < num_matrices; num_loops++) { - for (int32 i = 0; i < num_matrices; i++) { - if (processed[i]) - continue; - int32 m1 = matrices1[i], m2 = matrices2[i]; - std::vector::const_iterator iter = - std::lower_bound(matrices2.begin(), matrices2.end(), m1); - if (iter == matrices2.end() || *iter != m1) { - // Matrix m1 does not appear in the list 'matrices2', so - // we are safe to process it at any time. - swaps->push_back(std::pair(m1, m2)); - processed[i] = true; - } else { - int32 m1_pos_in_matrices2 = iter - matrices2.begin(); - if (processed[m1_pos_in_matrices2]) { - // We're safe to do this swap now, because the matrix m1 has already - // appeared on the RHS of a swap, and by this point has been - // deallocated, in effect. - swaps->push_back(std::pair(m1, m2)); - processed[i] = true; - } - // else do nothing, we cannot process m1 yet because - // at this point in the computation it is still allocated. - } - } - // The following assert is to check that we don't loop infinitely. We can - // prove that infinite looping won't happen, after on proving that there can - // be no cycles like (m1, m2), (m2, m3), (m3, m1) (the length of 3 is chosen - // arbitrarily as an example). If such a cycle existed, we can reach a - // contradiction based on the time-index (t) of the first cindex in m1. - // Define t1 = that time index, t2 the same for m2, t3 the same for m3. The - // existence of the three pairs [as pairs like (matrices1[i], matrices2[i])] - // implies that t2 > t1, t3 > t2, and t1 > t3 respectively, but this is - // impossible. - // This shows that all chains of dependencies must terminate. 
- KALDI_ASSERT(num_loops <= num_matrices); - } -} - -// static -void ComputationLoopedOptimizer::AddMatrixSwapCommands( - const std::vector &matrices1, - const std::vector &matrices2, - NnetComputation *computation) { - std::vector > swaps; - // Note: in 'easy' cases where matrices1 and matrices2 are disjoint, - // 'swaps' will just be the vector { (matrices1[0],matrices2[0]), - // (matrices1[1],matrices2[1]), ... }, - // but in some cases these may need to get reordered. - GetMatrixSwapOrder(matrices1, matrices2, &swaps); - - NnetComputation::Command goto_label_command = computation->commands.back(); - KALDI_ASSERT(goto_label_command.command_type == kGotoLabel); - computation->commands.pop_back(); - - // the following vector gives us, for each matrix index, a submatrix index - // that covers the whole of that matrix (needed because the commands - // require submatrix indexes) - std::vector whole_submatrices; - computation->GetWholeSubmatrices(&whole_submatrices); - size_t num_matrices = whole_submatrices.size(); - - for (size_t i = 0; i < swaps.size(); i++) { - int32 m1 = swaps[i].first, m2 = swaps[i].second; - KALDI_ASSERT(static_cast(m1) < num_matrices && - static_cast(m2) < num_matrices); - int32 s1 = whole_submatrices[m1], s2 = whole_submatrices[m2]; - computation->commands.push_back( - NnetComputation::Command( - kAllocMatrixFromOther, s1, s2)); - } - computation->commands.push_back(goto_label_command); -} - -// static -void ComputationLoopedOptimizer::FormInfiniteLoop( - int32 command1, int32 command2, - NnetComputation *computation) { - KALDI_ASSERT(static_cast(computation->commands.size()) >= - command2 + 1 && command1 < command2); - KALDI_ASSERT( - computation->commands[command1].command_type == kNoOperationMarker && - computation->commands[command2].command_type == kNoOperationMarker); - // Remove any commands after 'command2'. - computation->commands.resize(command2 + 1); - computation->commands[command2].command_type = kGotoLabel; - computation->commands[command2].arg1 = command1; - NnetComputation::Command c(kNoOperationLabel); - computation->commands.insert(computation->commands.begin() + command1, - c); - // Now the kNoOperationLabel command is at position 'command1'. -} - - - -bool ComputationLoopedOptimizer::Optimize() { - analyzer_.Init(nnet_, *computation_); - KALDI_ASSERT(!computation_->matrix_debug_info.empty() && - "You must request matrix debug info when compiling " - "looped computations."); - - // get the indexes of the separator commands at the ends of segments. - std::vector segment_ends; - GetSegmentEnds(*computation_, &segment_ends); - int32 time_shift_per_segment = FindTimeShift(*computation_, - segment_ends); - - // Ignore the end of the very last segment- it is not a candidate for a - // 'splice point'. What we're doing here is like creating a tape loop; we - // have to find a place where the list of variables is the same except for a - // time offset. - // [note: it's not exactly like a tape loop because the prologue can - // vary... the sequence is of the form like a b b b b b .. ] - segment_ends.pop_back(); - - - std::vector > active_matrices; - // Find the list of matrices active at each of those segment-end-command - // times. - FindActiveMatrices(*computation_, analyzer_, segment_ends, - &active_matrices); - - // Find a representation of the matrices of the computation as pairs - // (unique_id, time_offset) that are more amenable to finding - // matrices that represet lists of Cindexes that differ only by - // a time offset. 
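To illustrate the ordering constraint handled by GetMatrixSwapOrder() (the matrix numbers here are made up): with matrices1 = {10, 12} and matrices2 = {12, 14}, the pair (12, 14) cannot be emitted first, because matrix 12 still holds the data that pair (10, 12) needs to move into matrix 10. The loop therefore produces

    // swaps = { (10, 12),    // 10 takes over 12's contents; 12 is now free
    //           (12, 14) };  // 12 can then safely take over 14's contents
    // In the easy case where matrices1 and matrices2 are disjoint, the order
    // is simply (matrices1[0], matrices2[0]), (matrices1[1], matrices2[1]), ...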
- std::vector > matrix_to_pair; - CreateMatrixPairs(*computation_, &matrix_to_pair); - - // Create the reverse map from pair to matrix index; we'll need it. - unordered_map, int32, PairHasher > pair_to_matrix; - GetPairToMatrixMap(matrix_to_pair, &pair_to_matrix); - - // get lists of matrix per segment in the pair representation. - std::vector > > pair_lists; - ConvertListsToPairLists(active_matrices, matrix_to_pair, - &pair_lists); - - std::vector time_offsets; - NormalizePairLists(&pair_lists, &time_offsets); - - // Note: seg1 and seg2 are indexes into 'segment_ends', representing - // points in time (that happen to be the ends of segments). - int32 seg1, seg2; - if (!FindFirstRepeat(pair_lists, - time_offsets, - time_shift_per_segment, - &seg1, &seg2)) { - KALDI_VLOG(2) << "Could not find repeats of variables."; - return false; - } - - // reverse the normalization for segments seg1 and seg2. - for (size_t i = 0; i < pair_lists[seg1].size(); i++) - pair_lists[seg1][i].second += time_offsets[seg1]; - for (size_t i = 0; i < pair_lists[seg2].size(); i++) - pair_lists[seg2][i].second += time_offsets[seg2]; - std::vector seg1_matrices, seg2_matrices; - PairListToMatrixList(pair_lists[seg1], pair_to_matrix, &seg1_matrices); - PairListToMatrixList(pair_lists[seg2], pair_to_matrix, &seg2_matrices); - - int32 time_difference = time_offsets[seg2] - time_offsets[seg1]; - CheckIdentifiedMatrices(*computation_, seg1_matrices, seg2_matrices, - time_difference); - - - FormInfiniteLoop(segment_ends[seg1], segment_ends[seg2], computation_); - - AddMatrixSwapCommands(seg1_matrices, seg2_matrices, computation_); - - RenumberComputation(computation_); - - FixGotoLabel(computation_); - - return true; -} - - -void OptimizeLoopedComputation(const Nnet &nnet, - NnetComputation *computation) { - ComputationLoopedOptimizer optimizer(nnet, computation); - optimizer.Optimize(); -} - - - -void FixGotoLabel(NnetComputation *computation) { - int32 num_commands = computation->commands.size(); - if (num_commands == 0) - return; - for (int32 c = num_commands - 1; c >= 0; c--) { - if (computation->commands[c].command_type == kGotoLabel) { - int32 dest_command = computation->commands[c].arg1; - if (static_cast(dest_command) < computation->commands.size() && - computation->commands[dest_command].command_type == kNoOperationLabel) - return; // nothing to fix. - for (int32 d = 0; d + 1 < num_commands; d++) { - if (computation->commands[d].command_type == kNoOperationLabel) { - computation->commands[c].arg1 = d; - return; - } - } - KALDI_ERR << "Label not found."; - } else if (computation->commands[c].command_type == kProvideOutput) { - // sometimes kProvideOutput commands are temporarily ordered after - // the kGotoLabel command, and we need to work in that case. - continue; - } else { - // it loks like there is no 'goto' command in this computation- - // if there were, it would be right at the end, possibly followed by - // kProvideOutput commands. - break; - } - } -} - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index f3f27a12c8e..e224983f847 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -52,11 +52,14 @@ struct NnetOptimizeOptions; // Forward declaration. may be sub-matrices of larger matrices. 
Note: the following - - Define last-access(submatrix) as the + - Define last-access(submatrix) as: + If matrix-of(submatrix) is an output, then num-commands, otherwise the last command that accesses that submatrix for either read or write. [note: deallocation does not count as a read or write operation]. - - Define first-access(submatrix) as the first command not of type kAlloc* - that accessed that submatrix for either read or write. + - Define first-access(submatrix) as: + If matrix-of(submatrix) is an input, then -1, otherwise the first command + that is *not* an allocation command that accessed that submatrix for either + read or write. - Define last-write-access(submatrix) as the last command-index that accessed the submatrix in a write operation, or -1 if there is no such command (this could happen for inputs). @@ -96,41 +99,53 @@ struct NnetOptimizeOptions; // Forward declaration. Otherwise (cases (b) and (c), in-place propagate or backprop), we insist that: - first-access(s2) == C - last-access(s1) == C - Note: in either case, these conditions imply that m2/s2 is not an input and m1/s1 is - not an output. [i.e. s1 *may* be an input and s2 *may* be an output]. - - We can explain the procedure for both left-merge and right-merge in one, because - it's the same. Define s_to_keep and m_to_keep as s1 and m1 if we're left-merging - and s2 and m2 if we're right-merging, and s_to_discard and m_to_discard the opposite - way. - - The procedure to merge in general is as follows: + Note: in either case, these conditions imply that s2 is not an input and s1 is + not an output. + The sequence of things we have to do for a right-merge (in which we delete + s1,m1) is as follows: - All submatrices that reference m1, make them reference m2 instead. - [later we'll renumber so that there are no duplicates.] This automatically - takes care of making the input and output and allocation/deallocation - commands refer to the right matrix, in most cases. - - We need to get rid of duplicate or unnecessary allocation commands: - If m_to_discard is an input then get rid of the allocation command for - m_to_keep; otherwise get rid of the allocation command of m_to_discard. - - We need to get rid of duplicate or unnecessary deallocation commands: - If m_to_discard is an output then get rid of the deallocation command - for m_to_keep; otherwise get rid of the deallocation command for - m_to_discard. + [later we'll renumber so that there are no duplicates.] + - If m1 was an input, replace it as an input with m2 and remove the + command that allocated m2. + - If it was an assignment [case (a)], replace the assignment command with a + no-op. + - If both m1 and m2 have commands that allocate them, keep only the + allocation command for m2, and make sure that it zeroes the data (we can + later change to undefined if allowed) and that it's before the first + non-allocation access of m1. Otherwise remove any allocation commands + (the merged variable is an input). + - If both m1 and m2 have commands that deallocate them, keep only the + deallocation command for m2, and make sure that it's after the last + access of m1 (otherwise delete any deallocation command, because m2 must + be an output). [note: previously we kept the later of the 2 commands, + but this had the effect of making inaccurate the Analyzer info for + a matrix (m2) that might later be used. + - If m1 had stride_type == kStrideEqualNumCols, set m2's stride_type + to kStrideEqualNuMCols. 
+ + + The sequence of things we have to do for a right-merge (in which we delete + s1,m1) is as follows: + - All submatrices that reference m2, make them reference m1 instead. + [later we'll renumber so that there are no duplicates.] + - If m2 was an output, replace it as an output with m1 and remove the + command that deallocated m1. + ... the last four bullet-points, regarding removing the assignment command, + and allocation and deallocation, and stride-type, are the same as for a + left-merge, except swap m1 and m2. At the end when we call RemoveOrphanMatrices(), the renumbering code will automatically detect that there are duplicate submatrices, and will merge them, as well as removing the now-unused matrix indexes. After merging, we will mark the variables (i.e. row-ranges) underlying s1 and s2 as being - "dirty" so they can no longer be merged during the lifetime of this class-- - this is so we don't have to think to hard; we apply this optimization - multiple times until it makes no change (see - nnet-optimize.cc:VariableMerginOptimization()). + "dirty" so they can no longer be merged during the lifetime of this class. */ class VariableMergingOptimizer { public: VariableMergingOptimizer(const NnetOptimizeOptions &config, const Nnet &nnet, + const ComputationRequest &request, NnetComputation *computation); // Note: you can call this only once. If it returns true, it means it has // merged variables. In this case, you have the option to instantiate another @@ -155,10 +170,20 @@ class VariableMergingOptimizer { /// @param s2 [in] A submatrix-index s2 > 0 std::pair MayBeMerged(int32 command, int32 s1, int32 s2) const; - // Merges to matrices, whether left merge or right merge. s_to_keep and - // s_to_discard are the submatrix-indexes we will keep and discard - // respectively (these are s1 and s2 in some order. - void DoMerge(int32 command_index, int32 s_to_keep, int32 m_to_discard); + // performs the left merge. Search for left-merge in the comment + // above the class declaration for details. + void DoLeftMerge(int32 command_index, int32 s1, int32 s2); + + // performs the right merge. Search for right-merge in the comment + // above the class declaration for details. + void DoRightMerge(int32 command_index, int32 s1, int32 s2); + + // Performs the actions common to both left and right merges, regarding + // removing the assignment command, and allocation and deallocation (called + // from DoLeftMerge and DoRightMerge). The m_to_keep and m_to_discard + // are the matrix-indexes we will keep and discard respectively. + void DoMergeCommon(int32 command_index, int32 m_to_keep, + int32 m_to_discard); /// Marks the variables underlying submatrix 's' as dirty void MarkAsDirty(int32 s); @@ -167,6 +192,7 @@ class VariableMergingOptimizer { const NnetOptimizeOptions &config_; const Nnet &nnet_; + const ComputationRequest &request_; NnetComputation *computation_; Analyzer analyzer_; @@ -182,29 +208,184 @@ class VariableMergingOptimizer { }; +/** This class is responsible for consolidating the model-update part of + backprop commands, for components in (e.g.) recurrent networks that need to + have many separate backprop commands, into more efficient single commands + operating on consolidated data in larger matrices. This is useful for + recurrent networks. 
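   Note: for a component whose parameter gradient has the usual bilinear form
   (e.g. an affine component, where the weight-gradient is the sum over steps t
   of X_t^T D_t, with X_t the input rows and D_t the output-derivative rows at
   step t), consolidation is exact because of the block-matrix identity
   sum_t X_t^T D_t = [X_1; ...; X_T]^T [D_1; ...; D_T]: stacking the per-step
   inputs and output-derivatives row-wise and issuing a single backprop call
   computes the same update as the many per-step calls, but with one large
   matrix product instead of many small ones.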
*/ +class ModelUpdateConsolidator { + public: + ModelUpdateConsolidator(const Nnet &nnet, + NnetComputation *computation); + void ConsolidateModelUpdate(); + private: + void ConsolidateUpdateForComponent( + int32 component, + const std::vector &backprop_commands); + + /// This function, called at the end of ConsolidateModelUpdate(), takes the + /// commands that we have put in extra_commands_, final_commands_ and + /// final_deallocate_commands_, and puts them in the appropriate place in + /// computation->commands_. + void AddCommandsToComputation(); + + /// You call this function when you want to consolidate the values of a list + /// of submatrices taken just prior to particular commands. The input + /// 'commands' and 'submatrices' lists must be the same size, and size must be + /// > 1. This function will create a new matrix that is the row-wise + /// concatentation of all these submatrices, with values taken just prior to + /// the respective command indexes. This function will will add to + /// extra_commands_ the commands to do the copying at the appropriate places + /// (at the supplied command indexes; they will be inserted just before). The + /// return value is the submatrix index of a submatrix that represents the + /// whole of the consolidated matrix. This command will insert, at the + /// beginning of the computation (in extra_commands_[0]), a command to + /// initialize the matrix; and will append to final_deallocate_commands_ the + /// commands to deallocate the matrix. If computation_->matrix_debug_info is + /// nonempty, this function will also update computation_->matrix_debug_info + /// with suitable values for the newly added matrix + int32 ConsolidateSubmatrices( + const std::vector &commands, + const std::vector &submatrices); + + /// This function, called from ConsolidateSubmatrices, will + /// update 'debug_info' by appending the corresponding 'indexes' from + /// the existing debug info for this submatrix. It will also set + /// the 'is_deriv' of '*debug_info' to the same value as the + /// debug info for 'submatrix_index', and set the 'node_index' to the + /// 'node_index' in the debug info for that submatrix-index. + /// It requires that computation_->matrix_debug_info be nonempty. + void AppendDebugInfoForSubmatrix( + int32 submatrix_index, + NnetComputation::MatrixDebugInfo *debug_info) const; -/** - This optimization consolidates - the model-update part of - backprop commands, for components in (e.g.) recurrent networks that need to - have many separate backprop commands, into more efficient single commands - operating on consolidated data in larger matrices. This is useful for - recurrent networks. The resulting computation separates the backprop for - data-derivatives from the model-update part of backprop. - */ -void ConsolidateModelUpdate(const Nnet &nnet, - NnetComputation *computation); + const Nnet &nnet_; + NnetComputation *computation_; + // Indexed by the original command index in *computation_ (and sized to the + // original number of commands in *computation_ before we added anything), + // extra_commands_[c] contains a list of commands that need to be inserted + // just before command c in the previously existing computation. + std::vector > extra_commands_; + + // This is as list of kBackprop commands that will be placed after the + // commands in 'computation_->commands' and 'extra_commands_', but before + // the 'final_deallocate_commands_'. 
+ std::vector final_commands_; + // This is a list of commands to deallocate our 'consolidated' matrices; the + // commands will be placed after the commands in 'final_commands_'. + std::vector final_deallocate_commands_; +}; +// We declare this class in the .cc file, we don't need to export it. +// It's used inside RenumberComputation. +class ComputationRenumberer { + public: + ComputationRenumberer(NnetComputation *computation): + computation_(computation) { } + + void Renumber(); + private: + // this function removes unused vectors within the indexes_multi_ array, i.e. + // ones that are not referenced in the computation. + void RemoveUnusedIndexesMulti(); + // this function computes the submatrix_is_used_ vector, saying whether each + // of the original submatrices is referenced somewhere. + void ComputeSubmatrixIsUsed(); + // this function computes the matrix_is_used_ vector (from the + // submatrix_is_used_ vector, from computation_->input_output_info, and from + // computation_->commands, saying whether each of the original matrices is + // referenced somewhere, directly or indirectly. + void ComputeMatrixIsUsed(); + // This function sets up mappings from old to new matrix and submatrix indexes, + // writing to num_{,sub}matrices_new_ and old_to_new_{,sub}matrix_. + void SetUpMappings(); + // This function renumbers submatrix indexes appearing within commands and + // indexes_multi_, and then removes unused submatrices from the list of + // submatrices while leaving the matrix-indexes at their old values (they will + // be mapped by RenumberMatrices()). + void RenumberSubmatrices(); + // renumber matrix indexes appearing within 'commmands', within 'submatrices' + // and 'input_output_info'; renumber 'matrices' and if applicable + // 'debug_info'. + void RenumberMatrices(); + // removes duplicates within the indexes_multi array itself. + void RemoveIndexesMultiDuplicates(); + // removes unused elements and duplicates within 'computation->indexes' + void RenumberIndexes(); + // removes unused elements and duplicates within 'computation->indexes_ranges' + void RenumberIndexesRanges(); + + struct SubMatrixHasher { + SubMatrixHasher() { } + size_t operator () (const NnetComputation::SubMatrixInfo &submat) const { + // these numbers are arbitrarily chosen primes. + return submat.matrix_index + + 19553 * submat.row_offset + + 29297 * submat.num_rows + + 42209 * submat.col_offset + + 56527 * submat.num_cols; + } + }; + + + // Here, T will be int32 or std::pair + template + struct PointerCompare { + // This provides an operator < on two vectors of ints or pairs of ints. It + // is designed to provide a total order on the vectors while accessing as + // small a portion of the vectors' data as possible. It's used in removing + // duplicates from computation_->indexes_multi and computation_->indexes. + // First it compares the length, then it does lexicographical compare. + bool operator ()(const std::vector *ptr1, + const std::vector *ptr2) const { + size_t size1 = ptr1->size(), size2 = ptr2->size(); + if (size1 < size2) return true; + else if (size1 > size2) return false; + else return (*ptr1 < *ptr2); // use the std::vector operator <, which is + // lexicographical comparison. + } + }; + + /// creates a renumbering that removes the elements in "to_remove", + /// e.g. if old_num_elements = 3 and to_remove = [1], would output + /// the vector [ 0, -1, 1 ]. 
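// A standalone toy version of the renumbering described in the comment just
// above, so the quoted example (old_num_elements = 3, to_remove = [1]  ->
// [0, -1, 1]) can be checked directly.  ToyCreateRenumbering is an invented
// name; the real routine is the static CreateRenumbering declared below.
#include <cassert>
#include <cstdint>
#include <vector>

static void ToyCreateRenumbering(int32_t old_num_elements,
                                 const std::vector<int32_t> &to_remove,  // sorted, no duplicates
                                 std::vector<int32_t> *renumbering) {
  renumbering->assign(old_num_elements, 0);
  int32_t next_new_index = 0;
  size_t remove_pos = 0;
  for (int32_t i = 0; i < old_num_elements; i++) {
    if (remove_pos < to_remove.size() && to_remove[remove_pos] == i) {
      (*renumbering)[i] = -1;                // removed element maps to -1
      remove_pos++;
    } else {
      (*renumbering)[i] = next_new_index++;  // kept element gets the next new index
    }
  }
}

int main() {
  std::vector<int32_t> to_remove(1, 1);      // remove the element at index 1
  std::vector<int32_t> renumbering;
  ToyCreateRenumbering(3, to_remove, &renumbering);
  assert(renumbering.size() == 3 && renumbering[0] == 0 &&
         renumbering[1] == -1 && renumbering[2] == 1);
  return 0;
}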
+ static void CreateRenumbering(int32 old_num_elements, + const std::vector &to_remove, + std::vector *renumbering); + + /// creates a renumbering from old to new index that removes the unused + /// elements, e.g. if used == [ true, false, true, true], would output the + /// vector [ 0, -1, 1, 2 ]. Returns number of new elements, i.e. the + /// number of elements of 'used' that were true. + static int32 CreateRenumbering(const std::vector &used, + std::vector *renumbering); + + // vector of bool indexed by original submatrix-index, that is true if a + // submatrix-index is used somewhere in the computation (always true for + // the zeroth element). + std::vector submatrix_is_used_; + // vector of bool indexed by original submatrix-index, that is true if a + // submatrix-index will be kept; this is like submatrix_is_used_; but for + // duplicate submatrices, all but the first duplicate will be marked false). + std::vector submatrix_is_kept_; + // vector of bool indexed by original-matrix-index > 0, that is true if a + // matrix-index is used somewhere in the computation, directly or indirectly. + // always true for the zeroth element. + std::vector matrix_is_used_; + NnetComputation *computation_; + int32 num_matrices_new_; + int32 num_submatrices_new_; + std::vector old_to_new_matrix_; // numbered by orig-matrix-index, gives + // new-matrix-index. -1 for removed + // ones. + std::vector old_to_new_submatrix_; // numbered by orig-submatrix-index, + // gives new-submatrix-index. -1 + // for removed ones. +}; + -// Class DerivativeTimeLimiter is used inside LimitDerivativeTimes(). -// Its function is to modify the computation so that we don't work -// with derivatives outside of a specified range of t values; this is -// useful, for instance, in BLSTMs where you might have a fair amount of -// left and right context in the training examples but don't want to -// propagate the derivatives to there. -// // We require that the computation have debug info set up // (!matrix_debug_info.empty()) and that this be the first // optimization you perform. This means that the debug_info will @@ -221,6 +402,11 @@ class DerivativeTimeLimiter { private: + // This command ensures that for each matrix m there is a corresponding + // submatrix that spans the entire matrix, and stores its index in + // entire_submatrix_[m]. + void EnsureMatricesHaveEntireSubmatrices(); + // sets up matrix_prune_info_. void ComputeMatrixPruneInfo(); @@ -316,7 +502,7 @@ class DerivativeTimeLimiter { // for each matrix index > 0, the index of a submatrix that consists of // the entirety of that matrix. - std::vector whole_submatrices_; + std::vector entire_submatrix_; std::vector matrix_prune_info_; @@ -337,6 +523,10 @@ class DerivativeTimeLimiter { }; +// This utility function, used in code that calls LimitDerivativeTimes(), returns +// the largest time 't' in any of the 'outputs' in the computation request, +// or crashes if there are no outputs (or no cindexes in those outputs). +int32 MaxOutputTimeInRequest(const ComputationRequest &request); // This is the top-level interface to limit the times on which derivatives are // computed (e.g. 
for truncated BPTT); internally it uses class @@ -347,6 +537,7 @@ void LimitDerivativeTimes(const Nnet &nnet, int32 max_deriv_time, NnetComputation *computation); + /** This function, used in 'shortcut' compilation where we first compile a smaller computation with the same structure but only 2 distinct 'n' values, works out whether a computation is 'decomposable'; if so, @@ -363,69 +554,57 @@ void LimitDerivativeTimes(const Nnet &nnet, 'regular' structure, is as follows: - The 't' and 'x' values present are the same for each 'n', - The order in which the indexes appear is EITHER of the following: - - The 'n' index varies 'fast', i.e. the order is: + - The 'n' varies the most rapidly, i.e. the order is: (t1,x1,0), (t1,x1,1) ... (t1,x1,N-1) \ (t2,x2,0), (t2,x2,1) ... (t2,x2,N-1) ... - - The 'n' index varies 'slowly', i.e. the order is: + - The 'n' varies the least rapidly, i.e. the order is: (t1,x1,0), (t2,x2,0) ... \ (t1,x1,1), (t2,x2,1) ... \ ... \ (t1,x2,N-1), (t2,x2,N-1) ... In either case, there does not have to be any particular rhyme or - reason to the order of the t and x values; the regularity on 'n' is + reason to the order of the t and x values, the regularity on 'n' is all that we care about. */ bool ComputationIsDecomposable(const ComputationRequest &request, ComputationRequest *mini_request, - int32 *num_n_values); // TODO: implement this. - + int32 *num_n_values); /** - This function is used in 'shortcut' compilation to expand a computation - that has been compiled for exactly 2 'n' values, to one that is suitable - for some num_n_values > 2. - @param [in] computation The computation that was compiled for exactly - 2 'n' values (n=0 and n=1) - @param [in] need_debug_info True if we want to retain the 'debug_info' - in the output 'expanded_computation'. In any - case, the 'debug_info' is required in the - input computation. - @param [in] num_n_values The number of 'n' values we want in the output - computation - @param [out] expanded_computation The expanded computation. - - @return This function returns true if it succeeded, and false if it - could not expand the computation for some reason (e.g. there - was some non-simple component where the 'PrecomputedIndexes' - object could not be suitably expanded. If it returns false, - the output 'expanded_computation' is undefined (may contain junk). + This function is used in 'shortcut' compilation */ -bool ExpandComputation(const NnetComputation &computation, - bool need_debug_info, - int32 num_n_values, - NnetComputation *expanded_computation); - - - -/// This function detects cases where commands of type kCopyRows, kAddRows or -/// kAddToRows can be converted to commands of type kMatrixCopy or kMatrixAdd, -/// and converts them (this may involve adding submatrices). After doing this -/// you should at some point do RenumberComputation(), which will remove any -/// now-unused members of computation->indexes. -/// This function returns true if it made any changes to the computation. -bool ReplaceRowWithMatrixOps(NnetComputation *computation); - -/// This function detects submatrices and matrices that are never used (e.g. due -/// to changes made in other optimization code), and members of indexes, -/// indexes_multi and indexes_ranges that are unused or are duplicates, and -/// removes them from the computation by way of suitable renumbering. It does -/// not remove no-ops from computation->commands_; to do that, call -/// RemoveNoOps(computation). 
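// A standalone sketch that prints the two 'regular' index orderings described
// in the ComputationIsDecomposable() comment above, written out for two (t, x)
// pairs and N = 3 so the difference between "n varies fast" and "n varies
// slow" is concrete.  ToyIndex is a stand-in for the real Index struct and the
// (t, x) values are invented.
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

struct ToyIndex { int32_t n, t, x; };

int main() {
  const int32_t N = 3;
  std::vector<std::pair<int32_t, int32_t> > tx_pairs;   // the (t, x) values present
  tx_pairs.push_back(std::make_pair(10, 0));            // (t1, x1)
  tx_pairs.push_back(std::make_pair(11, 0));            // (t2, x2)

  std::vector<ToyIndex> n_fast, n_slow;
  for (size_t i = 0; i < tx_pairs.size(); i++)      // "n varies fast":
    for (int32_t n = 0; n < N; n++) {               //   (t1,x1,0) (t1,x1,1) (t1,x1,2) (t2,x2,0) ...
      ToyIndex idx = { n, tx_pairs[i].first, tx_pairs[i].second };
      n_fast.push_back(idx);
    }
  for (int32_t n = 0; n < N; n++)                   // "n varies slow":
    for (size_t i = 0; i < tx_pairs.size(); i++) {  //   (t1,x1,0) (t2,x2,0) (t1,x1,1) (t2,x2,1) ...
      ToyIndex idx = { n, tx_pairs[i].first, tx_pairs[i].second };
      n_slow.push_back(idx);
    }

  for (size_t i = 0; i < n_fast.size(); i++)
    std::cout << "(" << n_fast[i].t << "," << n_fast[i].x << "," << n_fast[i].n << ") ";
  std::cout << std::endl;
  for (size_t i = 0; i < n_slow.size(); i++)
    std::cout << "(" << n_slow[i].t << "," << n_slow[i].x << "," << n_slow[i].n << ") ";
  std::cout << std::endl;
  return 0;
}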
+bool ExpandComputation(const Computation &computation, + int32 num_n_vlues, + Computation *expanded_computation) + + + + +/// This function detects submatrices, matrices, and members of indexes_multi +/// and indexes that are never used (e.g. due to changes made in other +/// optimization code), and removes them from the computation by way of suitable +/// renumbering. It does not remove no-ops from computation->commands_; to do +/// that, call RemoveNoOps(computation). void RenumberComputation(NnetComputation *computation); /// Removes commands of type kNoOperation in the computation. void RemoveNoOps(NnetComputation *computation); +/// Wherever matrix orig_matrix_index appears in the input of the network +/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. +/// Returns true if it did replace it. +bool ReplaceInInput( + const Nnet &nnet, int32 orig_matrix_index, int32 new_matrix_index, + NnetComputation *computation); + +/// A helper function used in some optimization functions. +/// Wherever matrix orig_matrix_index appears in the output of the network +/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. +/// Returns true if it did replace it. +bool ReplaceInOutput( + const Nnet &nnet, int32 orig_matrix_index, int32 new_matrix_index, + NnetComputation *computation); + /// This function outputs to "submatrix_args" the addresses of a subset of /// arguments arg1 through arg6 in "command", that correspond to the indexes of /// submatrices. This is useful in renumbering code. Note: some of the @@ -441,6 +620,7 @@ void IdentifySubmatrixArgs(NnetComputation::Command *command, void IdentifySubmatrixArgs(std::vector *commands, std::vector *submatrix_args); + /// This function outputs to "submatrix_args" the addresses of integers in /// 'computation' that correspond to submatrices. These may be present in /// 'commands', and in 'indexes_multi'. This is useful in renumbering code. @@ -451,6 +631,32 @@ void IdentifySubmatrixArgsInComputation(NnetComputation *computation, std::vector *submatrix_args); +/// This function outputs to "matrix_args" the addresses of a subset of the +/// arguments arg1 through arg6 in "command", that correspond to the indexes of +/// matrices. This is useful in renumbering code. (Note: only a few types of +/// command use matrix indexes). +void IdentifyMatrixArgs(NnetComputation::Command *command, + std::vector *matrix_args); + +/// This function outputs to "matrix_args" the addresses of a subset of the +/// arguments arg1 through arg6 in commands in "commands", that correspond to +/// the indexes of matrices. This is useful in renumbering code. (Note: only a +/// few types of command use matrix indexes). +void IdentifyMatrixArgs(std::vector *command, + std::vector *matrix_args); + +/// This function outputs to "matrix_args" the addresses of indexes inside +/// 'computation' that correspond to matrices. These live inside +/// computation->commands and computation->input_output_info; and if +/// 'include_from_submatrices' is true, then the matrix-indexes present in +/// computation->submatrices[*].matrix_index will be included too. Zeros may be +/// present if there were optional arguments; we do include pointers to them, +/// but you can just ignore them. 
+void IdentifyMatrixArgsInComputation(bool include_from_submatrices, + NnetComputation *computation, + std::vector *matrix_args); + + /// Identifies in the vector of commands, arguments that correspond to indexes /// into the computation's indexes_multi array, and outputs a list of pointers /// to those arguments to 'indexes_multi_args'. Useful in renumbering code. @@ -475,26 +681,7 @@ void IdentifyIndexesArgs(std::vector *commands, void IdentifyIndexesRangesArgs(std::vector *commands, std::vector *indexes_ranges_args); -/// This function tries to optimize computation 'computation' for an 'looped' -/// computation. It expects as input a computation with no backprop but with -/// multiple 'segments' separated by command kNoOperation, where each segment -/// corresponds to a new chunk of input and output. It tries to locate a pair -/// of segment boundaries, with command indexes c1 and c2, where the active -/// matrices have the same debug-info other than a time offset and can be -/// identified with each other, and the no-op command at c2 can be replaced with -/// 'got c1', creating a computation that 'goes on forever'. -/// If it can't do this, it does nothing. You can figure out that this is the -/// case by checking whether kGotoLabel is the last command in the computation. -/// [If this optimization fails, the whole computation may have to be -/// regenerated with more segments.] -void OptimizeLoopedComputation(const Nnet &nnet, - NnetComputation *computation); - - -/// This function ensures that the arg1 of a final command of type kGotoLabel is -/// the same as the command with type kNoOperationLabel. This is necessary -/// if you do any other type of optimization after 'OptimizeLoopedComputation()'. -void FixGotoLabel(NnetComputation *computation); + /* diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index a1a62e3944c..9d6ff739768 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -34,13 +34,7 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &propagate_in_place); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &backprop_in_place); - std::string tok; - ReadToken(is, binary, &tok); - if (tok == "") { - ReadBasicType(is, binary, &replace_row_with_matrix_ops); - ReadToken(is, binary, &tok); - } - KALDI_ASSERT(tok == ""); + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &convert_addition); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &remove_assignments); @@ -58,7 +52,7 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &min_deriv_time); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &max_deriv_time); - + std::string tok; ReadToken(is, binary, &tok); if (tok == "") { ReadBasicType(is, binary, &max_deriv_time_relative); @@ -79,8 +73,6 @@ void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, propagate_in_place); WriteToken(os, binary, ""); WriteBasicType(os, binary, backprop_in_place); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, replace_row_with_matrix_ops); WriteToken(os, binary, ""); WriteBasicType(os, binary, convert_addition); WriteToken(os, binary, ""); @@ -201,8 +193,9 @@ void RemoveUnnecessaryZeroing(const Nnet &nnet, continue; // nothing to do. if (computation->commands[allocate_command].command_type != kAllocMatrixZeroed) { - continue; // already leaving it undefined, or it's an input, so nothing - // to do. 
+ KALDI_ASSERT(computation->commands[allocate_command].command_type == + kAllocMatrixUndefined); + continue; // already leaving it undefined, so nothing to do. } std::vector variables_for_matrix; a.variables.AppendVariablesForMatrix(matrix_index, &variables_for_matrix); @@ -301,8 +294,7 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, if (command.command_type == kAllocMatrixZeroed || command.command_type == kAllocMatrixUndefined || command.command_type == kDeallocMatrix) { - int32 s = command.arg1, m = computation->submatrices[s].matrix_index, - num_rows = computation->matrices[m].num_rows, + int32 m = command.arg1, num_rows = computation->matrices[m].num_rows, num_cols = computation->matrices[m].num_cols, num_cols_mod = num_cols * ( computation->matrices[m].stride_type == kDefaultStride ? 1 : -1); @@ -344,22 +336,33 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, kAllocMatrixFromOtherZeroed; } RemoveNoOps(computation); - FixGotoLabel(computation); } void VariableMergingOptimization(const NnetOptimizeOptions &config, const Nnet &nnet, + const ComputationRequest &request, NnetComputation *computation) { bool changed = true; while (changed) { changed = false; - VariableMergingOptimizer opt(config, nnet, computation); + VariableMergingOptimizer opt(config, nnet, request, computation); if (opt.MergeVariables()) changed = true; } } +// This is a simplified top-level interface to the model-update consolidation +// code from class ModelUpdateConsolidator. +void ConsolidateModelUpdate(const Nnet &nnet, + const ComputationRequest &request, + NnetComputation *computation) { + if (!request.need_model_derivative) + return; // An optimization; there would be nothing to do in this case. + ModelUpdateConsolidator consolidator(nnet, computation); + consolidator.ConsolidateModelUpdate(); +} + void ConvertAdditionToAssignment(const Nnet &nnet, NnetComputation *computation) { @@ -411,30 +414,15 @@ void ConvertAdditionToAssignment(const Nnet &nnet, } } - -int32 MaxOutputTimeInRequest(const ComputationRequest &request) { - int32 ans = std::numeric_limits::min(); - for (size_t i = 0; i < request.outputs.size(); i++) { - const std::vector &indexes (request.outputs[i].indexes); - std::vector::const_iterator iter = indexes.begin(), - end = indexes.end(); - for (; iter != end; ++iter) - if (iter->t > ans) - ans = iter->t; - } - if (ans == std::numeric_limits::min()) { - KALDI_ERR << "Failed to find any output indexes in computation request."; - } - return ans; -} - - void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, - int32 max_output_time_in_request, + const ComputationRequest &request, NnetComputation *computation) { + if (!config.optimize) + return; + if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, true); + CheckComputation(nnet, request, *computation, true); { // Call LimitDerivativeTimes(). 
// this will do nothing unless --min-deriv-time or --max-deriv-time @@ -442,91 +430,50 @@ void Optimize(const NnetOptimizeOptions &config, int32 max_deriv_time = config.max_deriv_time; if (config.max_deriv_time_relative != std::numeric_limits::max()) max_deriv_time = config.max_deriv_time_relative + - max_output_time_in_request; + MaxOutputTimeInRequest(request); LimitDerivativeTimes(nnet, config.min_deriv_time, max_deriv_time, computation); } if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, true); + CheckComputation(nnet, request, *computation, true); - if (config.optimize && config.consolidate_model_update) - ConsolidateModelUpdate(nnet, computation); + if (config.consolidate_model_update) + ConsolidateModelUpdate(nnet, request, computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, true); + CheckComputation(nnet, request, *computation, true); - if (config.optimize && config.convert_addition) { + if (config.convert_addition) ConvertAdditionToAssignment(nnet, computation); - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, true); - } - if (config.optimize && - (config.remove_assignments || config.backprop_in_place || - config.propagate_in_place)) { - VariableMergingOptimization(config, nnet, computation); - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); - } + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, request, *computation, true); - if (config.optimize && config.replace_row_with_matrix_ops) { - if (ReplaceRowWithMatrixOps(computation)) { - // if anything was changed... - - // We have to call RenumberComputation() to get rid of any removed - // indexes... actually this could be a little wasteful, but unfortunately - // it doesn't seem like we'd otherwise be doing any renumbering past this - // point. - RenumberComputation(computation); - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); - } - } + if (config.remove_assignments || config.backprop_in_place || + config.propagate_in_place) + VariableMergingOptimization(config, nnet, request, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, request, *computation, false); - if (config.optimize && config.initialize_undefined) { + if (config.initialize_undefined) RemoveUnnecessaryZeroing(nnet, computation); - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); - } - if (config.optimize && config.move_sizing_commands) { + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, request, *computation, false); + + if (config.move_sizing_commands) MoveSizingCommands(nnet, computation); - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); - } - // the looped computation optimization has to go before - // 'RemoveUnnecessaryAllocation()'. We don't gate this by 'config.optimize' - // because it's necessary for looped computation to run. - if (config.optimize_looped_computation){ - OptimizeLoopedComputation(nnet, computation); - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); - } + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, request, *computation, false); - if (config.optimize && config.allocate_from_other && - !config.optimize_looped_computation) { - // Don't do this if it's an looped computation because we're not sure if it - // would be correct in that case, as written. In any case the performance - // benefit is tiny. 
+ if (config.allocate_from_other) RemoveUnnecessaryAllocation(nnet, computation); - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); - } - - // The following is not configurable because it is necessary for - // the computation to run correctly (we do it after compilation too, - // but the operations may have been put out of order by - // other optimizations.) - ConsolidateIoOperations(nnet, computation); - - if (config.optimize_looped_computation) - FixGotoLabel(computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); + CheckComputation(nnet, request, *computation, false); } // ComputationRequests are distinguished by the names and indexes @@ -552,32 +499,32 @@ size_t ComputationRequestHasher::IoSpecificationToInt(const IoSpecification& spe // it makes the hasher faster. StringHasher string_hasher; ans = string_hasher(spec.name); - std::vector::const_iterator iter = spec.indexes.begin(), + std::vector::const_iterator itr = spec.indexes.begin(), end = spec.indexes.end(), med = end; - if (med > iter + n) + if (med > itr + n) med = iter + n; - for (; iter != med; ++iter) { - ans += iter->n * 1619; - ans += iter->t * 15649; - ans += iter->x * 89809; + for (; itr != med; ++itr) { + ans += (*itr).n * 1619; + ans += (*itr).t * 15649; + ans += (*itr).x * 89809; } // after the first 'n' values, look only at every n'th value. this makes the // hashing much faster, and in the kinds of structures that we actually deal // with, we shouldn't get unnecessary hash collisions as a result of this // optimization. - for (; iter < end; iter += n) { - ans += iter->n * 1619; - ans += iter->t * 15649; - ans += iter->x * 89809; + for (; iter < end; itr += n) { + ans += (*itr).n * 1619; + ans += (*itr).t * 15649; + ans += (*itr).x * 89809; } return ans; } void CachingOptimizingCompiler::UpdateCache(const ComputationRequest *request, NnetComputation *computation) { - if (computation_cache_.size() == config_.cache_capacity) { + if (computation_cache_.size() == cache_capacity_) { // full, locate the least-recently-accessed request const CacheType::iterator it = computation_cache_.find(access_queue_.front()); @@ -677,9 +624,7 @@ const NnetComputation* CachingOptimizingCompiler::Compile( ComputationChecker checker(check_config, nnet_, *computation); checker.Check(); } - Optimize(opt_config_, nnet_, - MaxOutputTimeInRequest(*request), - computation); + Optimize(opt_config_, nnet_, *request, computation); if (GetVerboseLevel() >= verbose_cutoff) { std::ostringstream os; computation->Print(os, nnet_); @@ -700,173 +645,6 @@ const NnetComputation* CachingOptimizingCompiler::Compile( return computation; } -/// Split the computation up into segments bounded by kNoOperationMarker. For -/// each segment, a pair of command-indexes (start, end) is output to the vector -/// 'segments', so the commands in the segment (not including -/// kNoOperationMarker) are numbered from start ... end - 1. -static void SplitComputationIntoSegments( - const NnetComputation &computation, - std::vector > *segments) { - - int32 num_commands = computation.commands.size(); - segments->clear(); - int32 cur_start = 0; - for (int32 c = 0; c < num_commands; c++) { - if (computation.commands[c].command_type == kNoOperationMarker) { - segments->push_back(std::pair(cur_start, c)); - cur_start = c + 1; - } - } - segments->push_back(std::pair(cur_start, num_commands)); -} - -// This is a helper function used in ConsolidateIoOperations(). 
-// -// Suppose we had something like this before ConsolidateIoOperations() (as would -// be printed by Print() - -// c90: output m50 to user [for node: 'output'] -// ... -// c100: [label for goto statement] -// c101: # computation segment separator [e.g., begin backward commands] -// ... -// c105: m62 = user input [for node: 'input'] -// ... -// c190: output m79 to user [for node: 'output'] -// ... -// c200: goto c100 -// -// this would get reordered to the following by ConsolidateIoOperations -// (the bulk of the code, before this function is called): -// -// c99: [label for goto statement] -// c100: output m50 to user [for node: 'output'] -// c101: # computation segment separator [e.g., begin backward commands] -// c102: m62 = user input [for node: 'input'] -// ... -// c199: goto c199 -// c200: output m79 to user [for node: 'output'] -// -// Now command c200 is unreachable, but there is a similar command at c100 -// (after the goto) that will substitute. However, the matrix indexes are different. -// So we need to change the above so that the last two commands read: -// c199: m50.swap(m79} -// c200: goto c199 -void FixGotoOutputReordering(const Nnet &nnet, - NnetComputation *computation) { - FixGotoLabel(computation); // make sure the destination label of the goto statement was - // correct. - int32 goto_command_index = -1; - for (int32 c = computation->commands.size(); c >= 0; c--) - if (computation->commands[c].command_type == kGotoLabel) - goto_command_index = c; - KALDI_ASSERT(goto_command_index > 0); - int32 goto_label_index = computation->commands[goto_command_index].arg1; - - std::vector output_commands_after_goto, - output_commands_after_label; - for (int32 c = goto_command_index + 1; - c < static_cast(computation->commands.size()); c++) { - KALDI_ASSERT(computation->commands[c].command_type == kProvideOutput); - output_commands_after_goto.push_back(c); - } - for (int32 c = goto_label_index + 1; - c < goto_command_index; c++) { // note: we break from this loop. - CommandType t = computation->commands[c].command_type; - if (t == kProvideOutput) - output_commands_after_label.push_back(c); - else if (t != kNoOperationMarker && t != kAcceptInput) - break; - } - if (output_commands_after_goto.size() != output_commands_after_label.size()) { - computation->Print(std::cerr, nnet); - KALDI_ERR << "Could not fix goto/output reordering, size mismatch."; - } - NnetComputation::Command goto_command = computation->commands[goto_command_index]; - // be we'll be replacing the final kProvideOutput commands with - // kAllocMatrixFromOther [i.e. swap commands], and moving them one command - // backward; later we'll put the goto command at the end. - for (size_t i = 0; i < output_commands_after_goto.size(); i++) { - int32 c1 = output_commands_after_label[i], - c2 = output_commands_after_goto[i], - new_c2 = c2 - 1; - int32 s1 = computation->commands[c1].arg1, - s2 = computation->commands[c2].arg1; - // The following assert checks that the network node-index is the same... - // the idea is that the outputs should have been provided in the same order. - // I can think of no reason why the order might be different. - KALDI_ASSERT(computation->commands[c1].arg2 == - computation->commands[c1].arg2); - computation->commands[new_c2].command_type = kAllocMatrixFromOther; - computation->commands[new_c2].arg1 = s1; - computation->commands[new_c2].arg2 = s2; - } - // ... and move the goto command to the end. 
- computation->commands.back() = goto_command; -} - - -void ConsolidateIoOperations(const Nnet &nnet, - NnetComputation *computation) { - bool ends_with_goto = - (!computation->commands.empty() && - computation->commands.back().command_type == kGotoLabel); - - // These segments, represented as (start-index, end-index), - // are segments of the computation separated by kNoOperationMarker. - std::vector > segments; - SplitComputationIntoSegments(*computation, &segments); - - int32 num_commands = computation->commands.size(); - std::vector reordered_commands(num_commands); - // put kNoOperationMarker between all segments in the reordered commands. - for (size_t s = 0; s + 1 < segments.size(); s++) - reordered_commands[segments[s].second].command_type = kNoOperationMarker; - - // for each segment we'll divide the commands up into those that must appear - // at the left of the segment (kAcceptInput for inputs and output-derivs), those - // that must appear in the middle (most commands), those that must appear - // on the right (kProvideOutput for output nodes and input derivatives). - std::vector left_commands, middle_commands, right_commands; - - for (size_t s = 0; s < segments.size(); s++) { - int32 segment_start = segments[s].first, - segment_end = segments[s].second; - left_commands.clear(); - middle_commands.clear(); - right_commands.clear(); - for (int32 c = segment_start; c < segment_end; c++) { - if (computation->commands[c].command_type == kProvideOutput) { - right_commands.push_back(c); - } else if (computation->commands[c].command_type == kAcceptInput) { - left_commands.push_back(c); - } else { - middle_commands.push_back(c); - } - } - std::vector::const_iterator iter = left_commands.begin(), - end = left_commands.end(); - int32 c = segment_start; - for (; iter != end; ++iter, ++c) - reordered_commands[c] = computation->commands[*iter]; - iter = middle_commands.begin(); - end = middle_commands.end(); - for (; iter != end; ++iter, ++c) - reordered_commands[c] = computation->commands[*iter]; - iter = right_commands.begin(); - end = right_commands.end(); - for (; iter != end; ++iter, ++c) - reordered_commands[c] = computation->commands[*iter]; - KALDI_ASSERT(c == segment_end); - } - computation->commands.swap(reordered_commands); - - if (ends_with_goto) - FixGotoOutputReordering(nnet, computation); -} - - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 86c6427396a..732f11e29ac 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -1,7 +1,7 @@ // nnet3/nnet-optimize.h -// Copyright 2015-2016 Johns Hopkins University (author: Daniel Povey) -// 2015 Xiaohui Zhang +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -37,7 +37,6 @@ struct NnetOptimizeOptions { bool consolidate_model_update; bool propagate_in_place; bool backprop_in_place; - bool replace_row_with_matrix_ops; bool convert_addition; bool remove_assignments; bool allow_left_merge; @@ -48,17 +47,12 @@ struct NnetOptimizeOptions { int32 min_deriv_time; int32 max_deriv_time; int32 max_deriv_time_relative; - // optimize_looped_computation is a 'hidden config' not available from - // the command line; it's set to true to enable the optimization for - // looped computation that turns a linear computation into a loop. 
- bool optimize_looped_computation; NnetOptimizeOptions(): optimize(true), consolidate_model_update(true), propagate_in_place(true), backprop_in_place(true), - replace_row_with_matrix_ops(true), convert_addition(true), remove_assignments(true), allow_left_merge(true), @@ -68,8 +62,7 @@ struct NnetOptimizeOptions { allocate_from_other(true), min_deriv_time(std::numeric_limits::min()), max_deriv_time(std::numeric_limits::max()), - max_deriv_time_relative(std::numeric_limits::max()), - optimize_looped_computation(false) { } + max_deriv_time_relative(std::numeric_limits::max()) {} void Register(OptionsItf *opts) { opts->Register("optimize", &optimize, "Set this to false to turn off all " @@ -121,39 +114,10 @@ struct NnetOptimizeOptions { bool operator == (const NnetOptimizeOptions &other) const; }; - -/* This utility function, used in code that calls LimitDerivativeTimes() (and - required in code that calls Optimize(), returns the largest time - 't' in any of the 'outputs' in the computation request, or crashes if there - are no outputs (or no cindexes in those outputs). */ -int32 MaxOutputTimeInRequest(const ComputationRequest &request); - - -/** This is the top-level function for optimizing a computation. Note: it - should really be called OptimizeAndPostprocess(), because there is at least - one thing it does (reordering I/O commands) that is necessary for a - computation to be run. - - @param [in] config The options that control, among other things, - which optimizations to apply. - @param [in] nnet The neural net for which the computation is being built - @param [in] max_output_time_in_request This value is only needed when the - max-deriv-time-relative config value is set in - 'config'. It should be set to the largest 't' value - encountered in any of the indexes in the 'output' - IoSpecifications in the ComputationRequests used to - compile the computation. However if there are multiple - ComputationRequests (i.e. it was an online computation) - you can just set it to any value you want, because - backpropagation is not supported so the - max-deriv-time-relative configuration value would not - have any effect. - @param [in,out] computation The computation to be optimized; this function - modifies it in-place. - */ +/// This is the top-level function for optimizing a computation. void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, - int32 max_output_time_in_request, + const ComputationRequest &request, NnetComputation *computation); // Hash function for ComputationRequest. It converts @@ -208,15 +172,13 @@ struct CachingOptimizingCompilerOptions { class CachingOptimizingCompiler { public: CachingOptimizingCompiler(const Nnet &nnet, - const CachingOptimizingCompilerOptions config = - CachingOptimizingCompilerOptions()): - nnet_(nnet), config_(config) { } + const CachingOptimizingCompilerOptions &config): + nnet_(nnet), config_(config), cache_capacity_(capacity) { } /// Note: nnet is retained as a const reference but opt_config is copied. CachingOptimizingCompiler(const Nnet &nnet, const NnetOptimizeOptions &opt_config, - const CachingOptimizingCompilerOptions config = - CachingOptimizingCompilerOptions()): + const CachingOptimizingCompilerOptions &config): nnet_(nnet), config_(config), opt_config_(opt_config) { } ~CachingOptimizingCompiler(); @@ -257,6 +219,9 @@ class CachingOptimizingCompiler { NnetComputation *computation); // This function updates the recently accessed queue. 
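// A toy sketch of the caching policy that computation_cache_, access_queue_
// and cache_capacity_ implement: when the cache is full, the
// least-recently-accessed key (the front of the access queue) is evicted, and
// a lookup hit moves its key to the back of the queue.  The real cache maps
// ComputationRequest pointers to NnetComputation pointers; everything here is
// simplified (string keys, int values) and assumes each key is inserted at
// most once.
#include <cassert>
#include <cstddef>
#include <list>
#include <string>
#include <unordered_map>

class ToyLruCache {
 public:
  explicit ToyLruCache(size_t capacity): capacity_(capacity) { }

  void Insert(const std::string &key, int value) {
    if (cache_.size() == capacity_) {          // full: evict least-recently-accessed
      cache_.erase(access_queue_.front());
      access_queue_.pop_front();
    }
    access_queue_.push_back(key);
    cache_[key] = value;
  }

  // Returns a pointer to the cached value, or NULL if not present; a hit
  // moves the key to the back of the queue (most recently used).
  const int *Lookup(const std::string &key) {
    std::unordered_map<std::string, int>::iterator it = cache_.find(key);
    if (it == cache_.end()) return NULL;
    access_queue_.remove(key);
    access_queue_.push_back(key);
    return &(it->second);
  }

 private:
  size_t capacity_;
  std::unordered_map<std::string, int> cache_;
  std::list<std::string> access_queue_;
};

int main() {
  ToyLruCache cache(2);
  cache.Insert("a", 1);
  cache.Insert("b", 2);
  cache.Lookup("a");        // "a" becomes the most recently used entry
  cache.Insert("c", 3);     // evicts "b", the least recently used entry
  assert(cache.Lookup("b") == NULL && *cache.Lookup("a") == 1);
  return 0;
}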
void UpdateAccessQueue(CacheType::iterator &cit); + // This configuration value determines how many unique Computations + // to cache in our most-recently-used cache. + int32 cache_capacity_; }; @@ -300,6 +265,7 @@ void LimitDerivativeTimes(const Nnet &nnet, /// class ModelUpdateConsolidator. Will fail if called a /// second time. void ConsolidateModelUpdate(const Nnet &nnet, + const ComputationRequest &request, NnetComputation *computation); /// This converts addition operations (things with Add in their names) to @@ -312,6 +278,7 @@ void ConvertAdditionToAssignment(const Nnet &nnet, /// This wraps class VariableMergingOptimizer in a simplified interface. void VariableMergingOptimization(const NnetOptimizeOptions &config, const Nnet &nnet, + const ComputationRequest &request, NnetComputation *computation); @@ -331,17 +298,6 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, NnetComputation *computation); -/// This optimization puts the input operations (kAcceptInput) and output -/// operations (kProvideOutput) at the very beginning or end of segments of -/// computation, respectively. -/// -/// This is actually necessary for computations to be run easily, because if these -/// commands were interspersed with the regular commands, you'd have to -/// call computer.Run() between the individual AcceptInput() and GetOutput() -/// function calls. -void ConsolidateIoOperations(const Nnet &nnet, - NnetComputation *computation); - } // namespace nnet3 From d3acb9e41e8c3c57e528723922f5fbfc018f644d Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Mon, 12 Dec 2016 20:11:38 -0800 Subject: [PATCH 023/213] Add new type of optimization of per-row commands; finish some of the internal code for shortcut compilation. --- src/nnet3/nnet-compile.cc | 18 +- src/nnet3/nnet-computation.cc | 105 +- src/nnet3/nnet-computation.h | 21 +- src/nnet3/nnet-compute.cc | 4 +- src/nnet3/nnet-nnet.h | 4 +- src/nnet3/nnet-optimize-utils.cc | 2450 ++++++++++++++++++++++++++---- src/nnet3/nnet-optimize-utils.h | 428 ++---- src/nnet3/nnet-optimize.cc | 337 +++- src/nnet3/nnet-optimize.h | 75 +- 9 files changed, 2696 insertions(+), 746 deletions(-) diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index 930887d85ea..d31e1ad5289 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -969,7 +969,8 @@ void Compiler::SetUpPrecomputedIndexes( NnetComputation *computation) { int32 num_steps = steps_.size(); KALDI_ASSERT(computation->component_precomputed_indexes.empty()); - computation->component_precomputed_indexes.push_back(NULL); + // the zeroth commponent is special, contains a NULL pointer. + computation->component_precomputed_indexes.resize(1); for (int32 step = 0; step < num_steps; step++) { StepInfo &step_info = steps_[step]; int32 node_index = step_info.node_index; @@ -999,7 +1000,20 @@ void Compiler::SetUpPrecomputedIndexes( } else { step_info.precomputed_indexes_index = computation->component_precomputed_indexes.size(); - computation->component_precomputed_indexes.push_back(precomputed_indexes); + + NnetComputation::PrecomputedIndexesInfo info; + info.data = precomputed_indexes; + + if (!input_indexes.empty() && input_indexes.back().n == 1 && + !output_indexes.empty() && output_indexes.back().n == 1) { + // If these conditions are true, it's *possible* that we are doing + // 'shortcut' compilation. So just in case that's what's going on, we + // store 'input_indexes' and 'output_indexes, which are needed by + // the ExpandComputation() function that is used in that process. 
+ info.input_indexes = input_indexes; + info.output_indexes = output_indexes; + } + computation->component_precomputed_indexes.push_back(info); } } } diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 046d8c824e3..bb3aaddc829 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -75,8 +75,9 @@ int32 ComputationRequest::IndexForOutput( } NnetComputation::~NnetComputation() { - for (size_t i = 0; i < component_precomputed_indexes.size(); i++) - delete component_precomputed_indexes[i]; + // note: component_precomputed_indexes[0].data is the NULL pointer. + for (size_t i = 1; i < component_precomputed_indexes.size(); i++) + delete component_precomputed_indexes[i].data; } void NnetComputation::ComputeCudaIndexes() { @@ -728,8 +729,9 @@ void NnetComputation::Read(std::istream &is, bool binary) { // delete any existing pointers in component_precomputed_indexes. - for (size_t i = 0; i < component_precomputed_indexes.size(); i++) - delete component_precomputed_indexes[i]; + // note: component_precomputed_indexes[0] is the NULL pointer. + for (size_t i = 1; i < component_precomputed_indexes.size(); i++) + delete component_precomputed_indexes[i].data; component_precomputed_indexes.clear(); size_t num_component_precomputed_indexes; @@ -737,20 +739,33 @@ void NnetComputation::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &num_component_precomputed_indexes); KALDI_ASSERT(num_component_precomputed_indexes >= 0); component_precomputed_indexes.resize(num_component_precomputed_indexes); - ExpectToken(is, binary, ""); - std::vector component_precomputed_indexes_tmp; - for (size_t c = 0; c < num_component_precomputed_indexes; c++) { - bool is_null; // a boolean indicating whether the pointer should be NULL. - ReadBasicType(is, binary, &is_null); - if (!is_null) { + + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + // Older on-disk format, before that code was extended for shortcut + // compilation. + component_precomputed_indexes.clear(); + component_precomputed_indexes.resize(num_component_precomputed_indexes); + for (size_t c = 0; c < num_component_precomputed_indexes; c++) { + bool is_null; // a boolean indicating whether the pointer should be NULL. 
+ ReadBasicType(is, binary, &is_null); + if (!is_null) { + ComponentPrecomputedIndexes* p = ComponentPrecomputedIndexes::ReadNew(is, binary); + component_precomputed_indexes[c].data = p; + } + } + } else { + KALDI_ASSERT(tok == ""); + for (size_t c = 1; c < num_component_precomputed_indexes; c++) { ComponentPrecomputedIndexes* p = ComponentPrecomputedIndexes::ReadNew(is, binary); - component_precomputed_indexes_tmp.push_back(p); - } else { - component_precomputed_indexes_tmp.push_back(NULL); + KALDI_ASSERT(p != NULL); + PrecomputedIndexesInfo &info = component_precomputed_indexes[c]; + info.data = p; + ReadIndexVector(is, binary, &(info.input_indexes)); + ReadIndexVector(is, binary, &(info.output_indexes)); } } - component_precomputed_indexes = component_precomputed_indexes_tmp; - size_t num_indexes; ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_indexes); @@ -829,14 +844,12 @@ void NnetComputation::Write(std::ostream &os, bool binary) const { if (!binary) os << std::endl; WriteToken(os, binary, ""); WriteBasicType(os, binary, component_precomputed_indexes.size()); - WriteToken(os, binary, ""); - for (size_t c = 0; c < component_precomputed_indexes.size(); c++) { - if (component_precomputed_indexes[c] != NULL) { - WriteBasicType(os, binary, false); // a boolean indicating whether the pointer is NULL. - component_precomputed_indexes[c]->Write(os, binary); - } else { - WriteBasicType(os, binary, true); - } + WriteToken(os, binary, ""); + for (size_t c = 1; c < component_precomputed_indexes.size(); c++) { + const PrecomputedIndexesInfo &info = component_precomputed_indexes[c]; + info.data->Write(os, binary); + WriteIndexVector(os, binary, info.input_indexes); + WriteIndexVector(os, binary, info.output_indexes); } if (!binary) os << std::endl; @@ -1072,6 +1085,7 @@ NnetComputation::NnetComputation(const NnetComputation &other): matrices(other.matrices), matrix_debug_info(other.matrix_debug_info), submatrices(other.submatrices), + component_precomputed_indexes(other.component_precomputed_indexes), indexes(other.indexes), indexes_multi(other.indexes_multi), indexes_ranges(other.indexes_ranges), @@ -1079,33 +1093,30 @@ NnetComputation::NnetComputation(const NnetComputation &other): need_model_derivative(other.need_model_derivative), indexes_cuda(other.indexes_cuda), indexes_ranges_cuda(other.indexes_ranges_cuda) { - for (size_t i = 0; i < other.component_precomputed_indexes.size(); i++) - component_precomputed_indexes.push_back( - other.component_precomputed_indexes[i] == NULL ? NULL : - other.component_precomputed_indexes[i]->Copy()); + for (size_t i = 1; i < component_precomputed_indexes.size(); i++) + component_precomputed_indexes[i].data = + component_precomputed_indexes[i].data->Copy(); } - NnetComputation& NnetComputation::operator = (const NnetComputation &other) { - matrices = other.matrices; - matrix_debug_info = other.matrix_debug_info; - submatrices = other.submatrices; - indexes = other.indexes; - indexes_multi = other.indexes_multi; - indexes_ranges = other.indexes_ranges; - commands = other.commands; - need_model_derivative = other.need_model_derivative; - indexes_cuda = other.indexes_cuda; - indexes_ranges_cuda = other.indexes_ranges_cuda; - - for (size_t i = 0; i < component_precomputed_indexes.size(); i++) - delete component_precomputed_indexes[i]; - component_precomputed_indexes.clear(); - for (size_t i = 0; i < other.component_precomputed_indexes.size(); i++) - component_precomputed_indexes.push_back( - other.component_precomputed_indexes[i] == NULL ? 
NULL : - other.component_precomputed_indexes[i]->Copy()); - return *this; + matrices = other.matrices; + matrix_debug_info = other.matrix_debug_info; + submatrices = other.submatrices; + indexes = other.indexes; + indexes_multi = other.indexes_multi; + indexes_ranges = other.indexes_ranges; + commands = other.commands; + need_model_derivative = other.need_model_derivative; + indexes_cuda = other.indexes_cuda; + indexes_ranges_cuda = other.indexes_ranges_cuda; + + for (size_t i = 1; i < component_precomputed_indexes.size(); i++) + delete component_precomputed_indexes[i].data; + component_precomputed_indexes = other.component_precomputed_indexes; + for (size_t i = 1; i < component_precomputed_indexes.size(); i++) + component_precomputed_indexes[i].data = + component_precomputed_indexes[i].data->Copy(); + return *this; } diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index da3a43bd15f..fd8cb06d06b 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -296,6 +296,25 @@ struct NnetComputation { void Read(std::istream &istream, bool binary); void Write(std::ostream &ostream, bool binary) const; }; + struct PrecomputedIndexesInfo { + // For each step of the computation for which we might possibly need to store + // a ComponentPrecomputedIndexes object (and note that this is only applicable + // for non-simple Components), this struct stores some information. + // The primary data is in 'data', it's an object of type inheriting from + // ComponentPrecomputedIndexes. + // The 'input_indexes' and 'output_indexes' are the vectors that were provided + // to the function Component::PrecomputeIndexes() when generating these + // PrecomputedIndexes objects. They currently only stored in cases where + // the 'n' values in the computation are numbered only zero and one, because + // these types of computations are compiled in 'shortcut' compilation, and + // in that case we'll need these indexes later in order to generate the + // 'expanded' computation (see the function ExpandComputation()). + ComponentPrecomputedIndexes *data; + std::vector input_indexes; + std::vector output_indexes; + PrecomputedIndexesInfo(): data(NULL) { } + }; + // "matrices" describes the sizes of the matrices that we use as variables in // the computation [note: index zero is reserved for an empty matrix]. Note: @@ -323,7 +342,7 @@ struct NnetComputation { // the NULL pointer, which is used for "simple" components and others that do // not require precomputed indexes. // These are owned here. - std::vector component_precomputed_indexes; + std::vector component_precomputed_indexes; // used in kAddRows, kAddToRows, kCopyRows, kCopyToRows. contains row-indexes. 
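// A toy sketch of the ownership pattern used by NnetComputation's copy
// constructor and operator= above: each element of a vector owns a heap object
// through a raw pointer (like PrecomputedIndexesInfo::data), so copying the
// vector element-wise must be followed by cloning each owned pointer, and
// assignment must first free the old pointers.  All names here are invented.
#include <vector>

struct ToyIndexesInfo {
  int *data;                       // owned; NULL for the reserved zeroth element
  ToyIndexesInfo(): data(0) { }
};

class ToyComputation {
 public:
  ToyComputation(): infos_(1) { }  // element 0 keeps a NULL pointer, as in the patch

  ~ToyComputation() {
    for (size_t i = 1; i < infos_.size(); i++) delete infos_[i].data;
  }

  void Add(int value) {            // helper so the toy can own something
    ToyIndexesInfo info;
    info.data = new int(value);
    infos_.push_back(info);
  }

  ToyComputation &operator = (const ToyComputation &other) {
    if (this == &other) return *this;
    for (size_t i = 1; i < infos_.size(); i++) delete infos_[i].data;  // free our old data
    infos_ = other.infos_;                         // shallow copy of the vector...
    for (size_t i = 1; i < infos_.size(); i++)     // ...then deep-copy each owned pointer
      infos_[i].data = new int(*infos_[i].data);
    return *this;
  }

 private:
  ToyComputation(const ToyComputation &);          // copy construction not needed here
  std::vector<ToyIndexesInfo> infos_;
};

int main() {
  ToyComputation a, b;
  a.Add(42);
  b = a;       // b now owns its own copy of the heap int holding 42
  return 0;
}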
std::vector > indexes; diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 75c0c464c90..d01327c8265 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -182,7 +182,7 @@ void NnetComputer::ExecuteCommand() { case kPropagate: { const Component *component = nnet_.GetComponent(c.arg1); ComponentPrecomputedIndexes *indexes = - computation_.component_precomputed_indexes[c.arg2]; + computation_.component_precomputed_indexes[c.arg2].data; const CuSubMatrix input(GetSubMatrix(c.arg3)); CuSubMatrix output(GetSubMatrix(c.arg4)); component->Propagate(indexes, input, &output); @@ -208,7 +208,7 @@ void NnetComputer::ExecuteCommand() { nnet_to_update_->GetComponent(c.arg1) : NULL); ComponentPrecomputedIndexes *indexes = - computation_.component_precomputed_indexes[c.arg2]; + computation_.component_precomputed_indexes[c.arg2].data; const CuSubMatrix in_value(GetSubMatrix(c.arg3)); const CuSubMatrix out_value(GetSubMatrix(c.arg4)); const CuSubMatrix out_deriv(GetSubMatrix(c.arg5)); diff --git a/src/nnet3/nnet-nnet.h b/src/nnet3/nnet-nnet.h index e999f20f4f5..19cfb3949ad 100644 --- a/src/nnet3/nnet-nnet.h +++ b/src/nnet3/nnet-nnet.h @@ -125,10 +125,10 @@ class Nnet { int32 NumNodes() const { return nodes_.size(); } - /// return component indexed c. not a copy; not owned by caller. + /// Return component indexed c. Not a copy; not owned by caller. Component *GetComponent(int32 c); - /// return component indexed c (const version). not a copy; not owned by + /// Return component indexed c (const version). Not a copy; not owned by /// caller. const Component *GetComponent(int32 c) const; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index b2ebb22ad71..62bda3a17e1 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -33,8 +33,12 @@ void IdentifySubmatrixArgs(NnetComputation::Command *c, case kAllocMatrixZeroed: case kAllocMatrixUndefined: case kDeallocMatrix: + submatrix_args->push_back(&c->arg1); + break; case kAllocMatrixFromOther: case kAllocMatrixFromOtherZeroed: + submatrix_args->push_back(&c->arg1); + submatrix_args->push_back(&c->arg2); break; case kPropagate: submatrix_args->push_back(&c->arg3); @@ -64,8 +68,13 @@ void IdentifySubmatrixArgs(NnetComputation::Command *c, case kCopyToRowsMulti: submatrix_args->push_back(&c->arg1); break; + case kAcceptInput: case kProvideOutput: + submatrix_args->push_back(&c->arg1); + break; case kNoOperation: case kNoOperationMarker: + case kNoOperationLabel: + case kGotoLabel: break; default: KALDI_ERR << "Unknown command type."; @@ -87,40 +96,13 @@ void IdentifySubmatrixArgs(std::vector *commands, } -void IdentifyMatrixArgs(std::vector *commands, - std::vector *matrix_args) { - matrix_args->clear(); - std::vector::iterator iter = commands->begin(), - end = commands->end(); - std::vector this_matrix_args; - for (; iter != end; ++iter) { - IdentifyMatrixArgs(&(*iter), &this_matrix_args); - matrix_args->insert(matrix_args->end(), - this_matrix_args.begin(), - this_matrix_args.end()); - } -} - -void IdentifyMatrixArgsInComputation(bool include_in_submatrices, - NnetComputation *computation, +void IdentifyMatrixArgsInComputation(NnetComputation *computation, std::vector *matrix_args) { - IdentifyMatrixArgs(&(computation->commands), matrix_args); int32 num_submatrices = computation->submatrices.size(); - matrix_args->reserve(matrix_args->size() + - (include_in_submatrices ? 
- computation->submatrices.size() : 0) + - 2 * computation->input_output_info.size()); - if (include_in_submatrices) - for (int32 s = 1; s < num_submatrices; s++) - matrix_args->push_back(&(computation->submatrices[s].matrix_index)); - unordered_map >::iterator - iter = computation->input_output_info.begin(), - end = computation->input_output_info.end(); - for (; iter != end; ++iter) { - matrix_args->push_back(&(iter->second.first)); - matrix_args->push_back(&(iter->second.second)); - } + matrix_args->reserve(computation->submatrices.size()); + for (int32 s = 1; s < num_submatrices; s++) + matrix_args->push_back(&(computation->submatrices[s].matrix_index)); } @@ -165,26 +147,112 @@ void IdentifyIndexesArgs(std::vector *commands, } } - - -void IdentifyMatrixArgs(NnetComputation::Command *c, - std::vector *matrix_args) { - matrix_args->clear(); - switch (c->command_type) { - case kAllocMatrixZeroed: - case kAllocMatrixUndefined: - case kDeallocMatrix: - matrix_args->push_back(&c->arg1); - break; - case kAllocMatrixFromOther: - case kAllocMatrixFromOtherZeroed: - matrix_args->push_back(&c->arg1); - matrix_args->push_back(&c->arg2); - break; - default: - break; - } -} +// We declare this class in the .cc file, we don't need to export it. +// It's used inside RenumberComputation. +class ComputationRenumberer { + public: + ComputationRenumberer(NnetComputation *computation): + computation_(computation) { } + + void Renumber(); + private: + // this function removes unused vectors within the indexes_multi_ array, i.e. + // ones that are not referenced in the computation. + void RemoveUnusedIndexesMulti(); + // this function computes the submatrix_is_used_ vector, saying whether each + // of the original submatrices is referenced somewhere. + void ComputeSubmatrixIsUsed(); + // this function computes the matrix_is_used_ vector (from the + // submatrix_is_used_ vector, from computation_->input_output_info, and from + // computation_->commands, saying whether each of the original matrices is + // referenced somewhere, directly or indirectly. + void ComputeMatrixIsUsed(); + // This function sets up mappings from old to new matrix and submatrix indexes, + // writing to num_{,sub}matrices_new_ and old_to_new_{,sub}matrix_. + void SetUpMappings(); + // This function renumbers submatrix indexes appearing within commands and + // indexes_multi_, and then removes unused submatrices from the list of + // submatrices while leaving the matrix-indexes at their old values (they will + // be mapped by RenumberMatrices()). + void RenumberSubmatrices(); + // renumber matrix indexes appearing within 'commmands', within 'submatrices' + // and 'input_output_info'; renumber 'matrices' and if applicable + // 'debug_info'. + void RenumberMatrices(); + // removes duplicates within the indexes_multi array itself. + void RemoveIndexesMultiDuplicates(); + // removes unused elements and duplicates within 'computation->indexes' + void RenumberIndexes(); + // removes unused elements and duplicates within 'computation->indexes_ranges' + void RenumberIndexesRanges(); + + struct SubMatrixHasher { + SubMatrixHasher() { } + size_t operator () (const NnetComputation::SubMatrixInfo &submat) const { + // these numbers are arbitrarily chosen primes. 
+ return submat.matrix_index + + 19553 * submat.row_offset + + 29297 * submat.num_rows + + 42209 * submat.col_offset + + 56527 * submat.num_cols; + } + }; + + + // Here, T will be int32 or std::pair + template + struct PointerCompare { + // This provides an operator < on two vectors of ints or pairs of ints. It + // is designed to provide a total order on the vectors while accessing as + // small a portion of the vectors' data as possible. It's used in removing + // duplicates from computation_->indexes_multi and computation_->indexes. + // First it compares the length, then it does lexicographical compare. + bool operator ()(const std::vector *ptr1, + const std::vector *ptr2) const { + size_t size1 = ptr1->size(), size2 = ptr2->size(); + if (size1 < size2) return true; + else if (size1 > size2) return false; + else return (*ptr1 < *ptr2); // use the std::vector operator <, which is + // lexicographical comparison. + } + }; + + /// creates a renumbering that removes the elements in "to_remove", + /// e.g. if old_num_elements = 3 and to_remove = [1], would output + /// the vector [ 0, -1, 1 ]. + static void CreateRenumbering(int32 old_num_elements, + const std::vector &to_remove, + std::vector *renumbering); + + /// creates a renumbering from old to new index that removes the unused + /// elements, e.g. if used == [ true, false, true, true], would output the + /// vector [ 0, -1, 1, 2 ]. Returns number of new elements, i.e. the + /// number of elements of 'used' that were true. + static int32 CreateRenumbering(const std::vector &used, + std::vector *renumbering); + + // vector of bool indexed by original submatrix-index, that is true if a + // submatrix-index is used somewhere in the computation (always true for + // the zeroth element). + std::vector submatrix_is_used_; + // vector of bool indexed by original submatrix-index, that is true if a + // submatrix-index will be kept; this is like submatrix_is_used_; but for + // duplicate submatrices, all but the first duplicate will be marked false). + std::vector submatrix_is_kept_; + // vector of bool indexed by original-matrix-index > 0, that is true if a + // matrix-index is used somewhere in the computation, directly or indirectly. + // always true for the zeroth element. + std::vector matrix_is_used_; + NnetComputation *computation_; + int32 num_matrices_new_; + int32 num_submatrices_new_; + std::vector old_to_new_matrix_; // numbered by orig-matrix-index, gives + // new-matrix-index. -1 for removed + // ones. + std::vector old_to_new_submatrix_; // numbered by orig-submatrix-index, + // gives new-submatrix-index. -1 + // for removed ones. +}; // static int32 ComputationRenumberer::CreateRenumbering( @@ -276,22 +344,10 @@ void ComputationRenumberer::ComputeMatrixIsUsed() { matrix_is_used_.clear(); matrix_is_used_.resize(computation_->matrices.size(), false); matrix_is_used_[0] = true; - - std::vector matrix_args; - bool include_in_submatrices = false; - IdentifyMatrixArgsInComputation(include_in_submatrices, - computation_, &matrix_args); - std::vector::iterator iter = matrix_args.begin(), - end = matrix_args.end(); - for (; iter != end; ++iter) { - int32 matrix_index = **iter; - if (matrix_index > 0) - matrix_is_used_[matrix_index] = true; - } // We also need to take into account when matrices are used indirectly via // submatrices (which is actually the main way they are accessed). 
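+  // For instance (hypothetical indexes): if submatrices 4 and 7 both have
+  // matrix_index == 3 and only submatrix 7 is marked in submatrix_is_used_,
+  // the loop below still marks matrix 3 as used, since any live submatrix
+  // keeps its underlying matrix alive.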
- int32 num_submatrices_orig = computation_->submatrices.size(); - for (int32 s = 1; s < num_submatrices_orig; s++) { + int32 num_submatrices = computation_->submatrices.size(); + for (int32 s = 1; s < num_submatrices; s++) { int32 matrix_index = computation_->submatrices[s].matrix_index; if (submatrix_is_used_[s]) matrix_is_used_[matrix_index] = true; @@ -355,20 +411,15 @@ void ComputationRenumberer::RenumberSubmatrices() { void ComputationRenumberer::RenumberMatrices() { std::vector matrix_args; - bool include_in_submatrices = true; - IdentifyMatrixArgsInComputation(include_in_submatrices, - computation_, &matrix_args); - std::vector::iterator iter = matrix_args.begin(), - end = matrix_args.end(); - for (; iter != end; ++iter) { - if (**iter > 0) { - int32 new_matrix_index = old_to_new_matrix_[**iter]; - // old_to_new_matrix_[s] for s > 0 is only <= 0 (actually, -1) for - // submatrices that are never accessed, and these should never appear - // in this list. - KALDI_ASSERT(new_matrix_index > 0); - **iter = new_matrix_index; - } + int32 num_submatrices = computation_->submatrices.size(); + for (int32 s = 1; s < num_submatrices; s++) { + int32 *matrix_index = &(computation_->submatrices[s].matrix_index); + // old_to_new_matrix_[s] for s > 0 is only <= 0 (actually, -1) for + // submatrices that are never accessed, and these should never appear + // in this list. (presumably because we renumber the submatrices first). + int32 new_matrix_index = old_to_new_matrix_[*matrix_index]; + KALDI_ASSERT(new_matrix_index > 0); + *matrix_index = new_matrix_index; } std::vector new_matrices; @@ -601,6 +652,7 @@ void RenumberComputation(NnetComputation *computation) { renumberer.Renumber(); } + void RemoveNoOps(NnetComputation *computation) { std::vector::iterator input_iter = computation->commands.begin(), @@ -615,87 +667,12 @@ void RemoveNoOps(NnetComputation *computation) { computation->commands.resize(output_iter - computation->commands.begin()); } -/// Wherever matrix orig_matrix_index appears in the input of the network -/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. -/// Returns true if it did replace it. -bool ReplaceInInput( - const Nnet &nnet, - int32 orig_matrix_index, int32 new_matrix_index, - NnetComputation *computation) { - bool ans = false; - int32 num_matrices = computation->matrices.size(); - KALDI_ASSERT(orig_matrix_index > 0 && orig_matrix_index < num_matrices && - new_matrix_index > 0 && new_matrix_index < num_matrices); - unordered_map >::iterator - iter = computation->input_output_info.begin(), - end = computation->input_output_info.end(); - for (; iter != end; ++iter) { - int32 network_node = iter->first, - &value_matrix_index = iter->second.first, - &deriv_matrix_index = iter->second.second; - if (nnet.IsOutputNode(network_node)) { - // deriv_matrix_index would be an input to the computation. - if (deriv_matrix_index == orig_matrix_index) { - deriv_matrix_index = new_matrix_index; - ans = true; - } - } else { - // value_matrix_index would be an input to the computation. - if (value_matrix_index == orig_matrix_index) { - value_matrix_index = new_matrix_index; - ans = true; - } - } - } - return ans; -} - - -/// Wherever matrix orig_matrix_index appears in the output of the network -/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. -/// Returns true if it did replace it. 
-bool ReplaceInOutput( - const Nnet &nnet, int32 orig_matrix_index, int32 new_matrix_index, - NnetComputation *computation) { - bool ans = false; - int32 num_matrices = computation->matrices.size(); - KALDI_ASSERT(orig_matrix_index > 0 && orig_matrix_index < num_matrices && - new_matrix_index > 0 && new_matrix_index < num_matrices); - unordered_map >::iterator - iter = computation->input_output_info.begin(), - end = computation->input_output_info.end(); - for (; iter != end; ++iter) { - int32 network_node = iter->first, - &value_matrix_index = iter->second.first, - &deriv_matrix_index = iter->second.second; - if (nnet.IsOutputNode(network_node)) { - // value_matrix_index would be an output of the computation. - if (value_matrix_index == orig_matrix_index) { - value_matrix_index = new_matrix_index; - ans = true; - } - } else { - // deriv_matrix_index would be an output of the computation. - if (deriv_matrix_index == orig_matrix_index) { - // we'd only have derivatives for actual inputs. [note: we also allow - // users to provide inputs for component nodes, but these would not have - // derivatives.] - KALDI_ASSERT(nnet.IsInputNode(network_node)); - deriv_matrix_index = new_matrix_index; - ans = true; - } - } - } - return ans; -} - VariableMergingOptimizer::VariableMergingOptimizer( const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation): - config_(config), nnet_(nnet), request_(request), + config_(config), nnet_(nnet), computation_(computation), already_called_merge_variables_(false) { analyzer_.Init(nnet, *computation); @@ -714,8 +691,7 @@ bool VariableMergingOptimizer::MergeVariables() { command_index++) { // This loop looks for pairs of sub-matrix indexes s1,s2 that we could // potentially merge into a single variable. - const NnetComputation::Command &c = - computation_->commands[command_index]; + const NnetComputation::Command &c = computation_->commands[command_index]; int32 s1 = -1, s2 = -1; if (c.command_type == kMatrixCopy && config_.remove_assignments) { @@ -747,10 +723,10 @@ bool VariableMergingOptimizer::MergeVariables() { if (s1 > 0 && s2 > 0) { std::pair p = MayBeMerged(command_index, s1, s2); if (p.first) { - DoLeftMerge(command_index, s1, s2); + DoMerge(command_index, s1, s2); merged = true; } else if (p.second) { - DoRightMerge(command_index, s1, s2); + DoMerge(command_index, s2, s1); merged = true; } } @@ -800,45 +776,33 @@ void VariableMergingOptimizer::MarkAsDirty(int32 s) { } } -void VariableMergingOptimizer::DoRightMerge(int32 command_index, - int32 s1, int32 s2) { - // Prevent further optimizations touching s1 or s2 (we can - // try again in a later round of optimization, with a new - // instance of this class). - MarkAsDirty(s1); - MarkAsDirty(s2); - - int32 m1 = computation_->submatrices[s1].matrix_index, - m2 = computation_->submatrices[s2].matrix_index; - KALDI_ASSERT(m1 != m2 && m1 > 0 && m2 > 0); - { // modify submatrices for submatrices of m1 to effectively be sub-matrices of - // s2 instead (they will refer to m2 as the matrix_index). - std::vector::const_iterator iter = matrix_to_submatrix_[m1].begin(), - end = matrix_to_submatrix_[m1].end(); +void VariableMergingOptimizer::DoMerge(int32 command_index, + int32 s_to_keep, + int32 s_to_discard) { + // Prevent further optimizations touching either submatrix (we can try again + // in a later round of optimization, with a new instance of this class). 
+ MarkAsDirty(s_to_keep); + MarkAsDirty(s_to_discard); + + int32 m_to_keep = computation_->submatrices[s_to_keep].matrix_index, + m_to_discard = computation_->submatrices[s_to_discard].matrix_index; + KALDI_ASSERT(m_to_keep != m_to_discard && m_to_keep > 0 && m_to_discard > 0); + + { // modify submatrices of m_to_discard to effectively be sub-matrices of + // s_to_keep instead (they will refer to m_to_keep as the matrix_index). + std::vector::const_iterator iter = + matrix_to_submatrix_[m_to_discard].begin(), + end = matrix_to_submatrix_[m_to_discard].end(); for (; iter != end; ++iter) { int32 submatrix_index = *iter; - KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index==m1); + KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index + == m_to_discard); computation_->submatrices[submatrix_index] = - GetSubMatrixOfSubMatrix(*computation_, submatrix_index, s2); + GetSubMatrixOfSubMatrix(*computation_, submatrix_index, + s_to_keep); } } - const std::vector &matrix_accesses = analyzer_.matrix_accesses; - // - If m1 was an input, replace it as an input with m2 - bool replaced = ReplaceInInput(nnet_, m1, m2, computation_); - KALDI_ASSERT(replaced == matrix_accesses[m1].is_input); - if (replaced) { // Remove the command that allocates m2. - int32 alloc_command = matrix_accesses[m2].allocate_command; - KALDI_ASSERT(alloc_command != -1); - computation_->commands[alloc_command].command_type = - kNoOperation; - } - // we keep matrix m2 (so m2 is m_to_keep, m1 is m_to_discard). - DoMergeCommon(command_index, m2, m1); -} - -void VariableMergingOptimizer::DoMergeCommon(int32 command_index, - int32 m_to_keep, - int32 m_to_discard) { + ComputationAnalysis analysis(*computation_, analyzer_); NnetComputation::Command &c = computation_->commands[command_index]; const std::vector &matrix_accesses = @@ -852,52 +816,59 @@ void VariableMergingOptimizer::DoMergeCommon(int32 command_index, c.arg2 = -1; } - // - If both m_to_keep and m_to_discard have commands that deallocate them, - // keep only the allocation command for m_to_keep, and make sure it's after - // the last access of m_to_discard (otherwise delete any deallocation - // command). + // We want to ensure that there is only one deallocation command. + // If neither matrix is an output, then there will be 2 deallocation + // commands and we keep the one for m_to_keep (which, if the sizes + // differ, will be the larger of the two, so it's the one whose + // submatrix index refers to the entirety of the matrix). + // If one of them is an output, then remove the deallocation command + // of whichever one is not an output. + // As a simplification to the logic above: if the 'discard' matrix + // has a deallocation command (i.e. if that matrix was not an output) + // then remove it; otherwise remove the deallocation command of + // the 'keep' matrix. 
+ int32 dealloc_keep = matrix_accesses[m_to_keep].deallocate_command, dealloc_discard = matrix_accesses[m_to_discard].deallocate_command; - if (dealloc_keep != -1 && dealloc_discard != -1) { - KALDI_ASSERT(analysis.LastMatrixAccess(m_to_discard) < dealloc_keep); + if (dealloc_discard != -1) { computation_->commands[dealloc_discard].command_type = kNoOperation; } else { - if (dealloc_keep != -1) - computation_->commands[dealloc_keep].command_type = - kNoOperation; - if (dealloc_discard != -1) - computation_->commands[dealloc_discard].command_type = - kNoOperation; - } - - // - If both m_to_keep and m_to_discard have commands that allocate them, - // keep only the allocation command for m_to_keep and make sure it's - // before the first access of m_to_discard. - // (otherwise delete any allocation command). - int32 alloc_keep = matrix_accesses[m_to_keep].allocate_command, - alloc_discard = matrix_accesses[m_to_discard].allocate_command; - if (alloc_keep != -1 && alloc_discard != -1) { + KALDI_ASSERT(dealloc_keep != -1); + computation_->commands[dealloc_keep].command_type = kNoOperation; + } + + { + // - Both m_to_keep and m_to_discard will have commands that allocate + // them, as all matrices do (note, kAcceptInput counts as an allocation + // command). If one of them is kAcceptInput, then delete the other one. + // Otherwise delete the "discard" one. As a simplification of the logic + // of the previous sentence: if the "discard" allocate command is + // kAcceptInput then delete the "keep" allocate command, else delete + // the "discard" allocate command. + // Note: after we renumber the submatrices, they both refer to the + // same underlying matrix, but we need to refer to them using a + // submatrix that refers to the entire matrix. The one we keep will + // always refer to the entire matrix. (In the case where one of + // them is an input, both submatrices are guaranteed to refer to the + // entire matrix). + int32 alloc_keep = matrix_accesses[m_to_keep].allocate_command, + alloc_discard = matrix_accesses[m_to_discard].allocate_command; + + KALDI_ASSERT(alloc_keep != -1 && alloc_discard != -1); KALDI_ASSERT(analysis.FirstMatrixAccess(m_to_discard) > alloc_keep); + NnetComputation::Command &keep_alloc_command = computation_->commands[alloc_keep], &discard_alloc_command = computation_->commands[alloc_discard]; - discard_alloc_command.command_type = kNoOperation; - if (keep_alloc_command.command_type == kAllocMatrixUndefined) { - keep_alloc_command.command_type = kAllocMatrixZeroed; - } else if (keep_alloc_command.command_type == kAllocMatrixFromOther) { - keep_alloc_command.command_type = kAllocMatrixFromOtherZeroed; + if (discard_alloc_command.command_type == kAcceptInput) { + keep_alloc_command.command_type = kNoOperation; + } else { + discard_alloc_command.command_type = kNoOperation; } - } else { - if (alloc_keep != -1) - computation_->commands[alloc_keep].command_type = - kNoOperation; - if (alloc_discard != -1) - computation_->commands[alloc_discard].command_type = - kNoOperation; } // If the matrix to discard had stride_type == kStrideEqualNumCols, set the - // matrix to keep's stride_type to kStrideEqualNuMCols. + // matrix to keep's stride_type to kStrideEqualNumCols. if (computation_->matrices[m_to_discard].stride_type == kStrideEqualNumCols) { computation_->matrices[m_to_keep].stride_type = kStrideEqualNumCols; // ... and perform an additional check. 
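For orientation, the merge above boils down to re-pointing every submatrix record of the discarded matrix at the kept matrix, and then turning the now-redundant copy, allocation and deallocation commands into no-ops. Below is a minimal, self-contained sketch of just the re-pointing step, using simplified stand-in types (MiniSubMatrix and RedirectSubmatrices are illustrative names only); the real code routes the update through GetSubMatrixOfSubMatrix() so that row and column offsets compose correctly rather than being overwritten.

#include <cassert>
#include <vector>

// Simplified stand-in for NnetComputation::SubMatrixInfo (only the field
// needed for the illustration).
struct MiniSubMatrix { int matrix_index; };

// Re-point every submatrix that currently refers to 'm_to_discard' so that it
// refers to 'm_to_keep'.  'matrix_to_submatrix[m]' lists the indexes of the
// submatrices whose matrix_index equals m.
void RedirectSubmatrices(int m_to_keep, int m_to_discard,
                         const std::vector<std::vector<int> > &matrix_to_submatrix,
                         std::vector<MiniSubMatrix> *submatrices) {
  const std::vector<int> &to_fix = matrix_to_submatrix[m_to_discard];
  for (size_t i = 0; i < to_fix.size(); i++) {
    MiniSubMatrix &s = (*submatrices)[to_fix[i]];
    assert(s.matrix_index == m_to_discard);  // sanity check, mirrors the KALDI_ASSERT above
    s.matrix_index = m_to_keep;
  }
}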
@@ -908,43 +879,6 @@ void VariableMergingOptimizer::DoMergeCommon(int32 command_index, } } -void VariableMergingOptimizer::DoLeftMerge(int32 command_index, - int32 s1, int32 s2) { - // Prevent further optimizations touching s1 or s2 (we can - // try again in a later round of optimization, with a new - // instance of this class). - MarkAsDirty(s1); - MarkAsDirty(s2); - - int32 m1 = computation_->submatrices[s1].matrix_index, - m2 = computation_->submatrices[s2].matrix_index; - KALDI_ASSERT(m1 != m2 && m1 > 0 && m2 > 0); - { // modify submatrices for submatrices of m2 to effectively be sub-matrices of - // s1 instead (they will refer to m1 as the matrix_index). - std::vector::const_iterator iter = matrix_to_submatrix_[m2].begin(), - end = matrix_to_submatrix_[m2].end(); - for (; iter != end; ++iter) { - int32 submatrix_index = *iter; - KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index==m2); - computation_->submatrices[submatrix_index] = - GetSubMatrixOfSubMatrix(*computation_, submatrix_index, s1); - } - } - const std::vector &matrix_accesses = analyzer_.matrix_accesses; - // - If m2 was an output, replace it as an input with m1. - bool replaced = ReplaceInOutput(nnet_, m2, m1, computation_); - KALDI_ASSERT(replaced == matrix_accesses[m2].is_output); - if (replaced) { // Remove the command that deallocates m1. - int32 dealloc_command = matrix_accesses[m1].deallocate_command; - KALDI_ASSERT(dealloc_command != -1); - computation_->commands[dealloc_command].command_type = - kNoOperation; - } - // we keep matrix m1 (so m1 is m_to_keep, m2 is m_to_discard). - DoMergeCommon(command_index, m1, m2); -} - - std::pair VariableMergingOptimizer::MayBeMerged( @@ -1015,6 +949,77 @@ std::pair VariableMergingOptimizer::MayBeMerged( return std::pair(false,false); } + +/** This class is responsible for consolidating the model-update part of + backprop commands, for components in (e.g.) recurrent networks that need to + have many separate backprop commands, into more efficient single commands + operating on consolidated data in larger matrices. This is useful for + recurrent networks. */ +class ModelUpdateConsolidator { + public: + ModelUpdateConsolidator(const Nnet &nnet, + NnetComputation *computation); + void ConsolidateModelUpdate(); + private: + void ConsolidateUpdateForComponent( + int32 component, + const std::vector &backprop_commands); + + /// This function, called at the end of ConsolidateModelUpdate(), takes the + /// commands that we have put in extra_commands_, final_commands_ and + /// final_deallocate_commands_, and puts them in the appropriate place in + /// computation->commands_. + void AddCommandsToComputation(); + + /// You call this function when you want to consolidate the values of a list + /// of submatrices taken just prior to particular commands. The input + /// 'commands' and 'submatrices' lists must be the same size, and size must be + /// > 1. This function will create a new matrix that is the row-wise + /// concatentation of all these submatrices, with values taken just prior to + /// the respective command indexes. This function will will add to + /// extra_commands_ the commands to do the copying at the appropriate places + /// (at the supplied command indexes; they will be inserted just before). The + /// return value is the submatrix index of a submatrix that represents the + /// whole of the consolidated matrix. 
This command will insert, at the + /// beginning of the computation (in extra_commands_[0]), a command to + /// initialize the matrix; and will append to final_deallocate_commands_ the + /// commands to deallocate the matrix. If computation_->matrix_debug_info is + /// nonempty, this function will also update computation_->matrix_debug_info + /// with suitable values for the newly added matrix + int32 ConsolidateSubmatrices( + const std::vector &commands, + const std::vector &submatrices); + + /// This function, called from ConsolidateSubmatrices, will + /// update 'debug_info' by appending the corresponding 'indexes' from + /// the existing debug info for this submatrix. It will also set + /// the 'is_deriv' of '*debug_info' to the same value as the + /// debug info for 'submatrix_index', and set the 'node_index' to the + /// 'node_index' in the debug info for that submatrix-index. + /// It requires that computation_->matrix_debug_info be nonempty. + void AppendDebugInfoForSubmatrix( + int32 submatrix_index, + NnetComputation::MatrixDebugInfo *debug_info) const; + + const Nnet &nnet_; + NnetComputation *computation_; + + // Indexed by the original command index in *computation_ (and sized to the + // original number of commands in *computation_ before we added anything), + // extra_commands_[c] contains a list of commands that need to be inserted + // just before command c in the previously existing computation. + std::vector > extra_commands_; + + // This is as list of kBackprop commands that will be placed after the + // commands in 'computation_->commands' and 'extra_commands_', but before + // the 'final_deallocate_commands_'. + std::vector final_commands_; + // This is a list of commands to deallocate our 'consolidated' matrices; the + // commands will be placed after the commands in 'final_commands_'. + std::vector final_deallocate_commands_; +}; + + void ModelUpdateConsolidator::AppendDebugInfoForSubmatrix( int32 submatrix_index, NnetComputation::MatrixDebugInfo *debug_info) const { @@ -1038,7 +1043,6 @@ void ModelUpdateConsolidator::AppendDebugInfoForSubmatrix( src_info.cindexes.begin() + row_end); } - // see comment by declaration in header. int32 ModelUpdateConsolidator::ConsolidateSubmatrices( const std::vector &commands, @@ -1067,14 +1071,14 @@ int32 ModelUpdateConsolidator::ConsolidateSubmatrices( int32 new_whole_submatrix = computation_->NewMatrix(num_rows, num_cols, stride_type); // Add a command at the very start, to initialize this new matrix. - int32 new_matrix_index = - computation_->submatrices[new_whole_submatrix].matrix_index; // we can later on optimize this zeroed initialization to an undefined // initialization. extra_commands_[0].push_back( - NnetComputation::Command(kAllocMatrixZeroed, new_matrix_index)); + NnetComputation::Command(kAllocMatrixZeroed, new_whole_submatrix)); final_deallocate_commands_.push_back( - NnetComputation::Command(kDeallocMatrix, new_matrix_index)); + NnetComputation::Command(kDeallocMatrix, new_whole_submatrix)); + int32 new_matrix_index = + computation_->submatrices[new_whole_submatrix].matrix_index; if (!computation_->matrix_debug_info.empty()) computation_->matrix_debug_info[new_matrix_index].Swap(&debug_info); @@ -1091,7 +1095,7 @@ int32 ModelUpdateConsolidator::ConsolidateSubmatrices( // submatrix numbered 'new_submatrix' the contents of the submatrix numbered // 'submatrices[i]'. 
Note: we hope that a later pass of optimization // (VariableMergingOptimization) will remove this redundant copy by - // having the operation that created it right directly to the location + // having the operation that created it write directly to the location // we want it to be. NnetComputation::Command c(kMatrixCopy, new_submatrix, submatrices[i]); extra_commands_[commands[i]].push_back(c); @@ -1212,6 +1216,19 @@ void ModelUpdateConsolidator::ConsolidateModelUpdate() { AddCommandsToComputation(); } + +void ConsolidateModelUpdate(const Nnet &nnet, + NnetComputation *computation) { + // This following if-statement is an optimization: if the computation + // request(s) had need_model_derivative == false, there would be nothing to + // optimize, so don't bother trying. + if (!computation->need_model_derivative) + return; + ModelUpdateConsolidator consolidator(nnet, computation); + consolidator.ConsolidateModelUpdate(); +} + + // inline void DerivativeTimeLimiter::GetPruneValues(int32 initial_submatrix, int32 new_submatrix, @@ -1295,8 +1312,8 @@ void DerivativeTimeLimiter::ModifyCommand(NnetComputation::Command *command) { command->arg5 = mapped_output_deriv_submatrix; command->arg6 = mapped_input_deriv_submatrix; } - } break; + } case kMatrixCopy: case kMatrixAdd: MapSimpleMatrixCommand(command); break; @@ -1311,6 +1328,7 @@ void DerivativeTimeLimiter::ModifyCommand(NnetComputation::Command *command) { MapAddRowRangesCommand(command); break; } + case kAcceptInput: case kProvideOutput: case kNoOperation: case kNoOperationMarker: break; default: @@ -1333,7 +1351,7 @@ void DerivativeTimeLimiter::MapSimpleMatrixCommand(NnetComputation::Command *c) c->command_type = kNoOperation; return; } - // left_prune1 is the nmber of rows pruned away on the left for submatrix1. + // left_prune1 is the number of rows pruned away on the left for submatrix1. int32 orig_num_rows = computation_->submatrices[submatrix1].num_rows, left_prune1, left_prune2, right_prune1, right_prune2; GetPruneValues(submatrix1, submatrix1_mapped, &left_prune1, &right_prune1); @@ -1355,7 +1373,7 @@ void DerivativeTimeLimiter::MapSimpleMatrixCommand(NnetComputation::Command *c) } else { int32 num_rows = orig_num_rows - left_prune - right_prune; // note: the call NewSubMatrix effectively gives us a sub-matrix of a - // subm-matrix. + // sub-matrix. c->arg1 = computation_->NewSubMatrix(submatrix1, left_prune, num_rows, 0, -1); c->arg2 = computation_->NewSubMatrix(submatrix2, @@ -1565,7 +1583,7 @@ void DerivativeTimeLimiter::LimitDerivTimes() { max_deriv_time_ == std::numeric_limits::max()) return; // nothing to do. 
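+  // For orientation (illustrative values only): with min_deriv_time_ = 0 and
+  // max_deriv_time_ = 9, rows of derivative matrices whose cindexes have
+  // t < 0 or t > 9 become dead.  The steps below locate the whole-matrix
+  // submatrices, work out which matrices can be shrunk to the kept row range,
+  // rewrite the commands that touched the pruned rows, and finally renumber
+  // the computation.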
- EnsureMatricesHaveEntireSubmatrices(); + computation_->GetWholeSubmatrices(&whole_submatrices_); ComputeMatrixPruneInfo(); ComputeSubmatrixMaps(); ModifyCommands(); @@ -1574,20 +1592,6 @@ void DerivativeTimeLimiter::LimitDerivTimes() { RenumberComputation(computation_); } -void DerivativeTimeLimiter::EnsureMatricesHaveEntireSubmatrices() { - int32 num_matrices = computation_->matrices.size(), - num_submatrices = computation_->submatrices.size(); - entire_submatrix_.clear(); - entire_submatrix_.resize(num_matrices, -1); - entire_submatrix_[0] = 0; - for (int32 s = 1; s < num_submatrices; s++) - if (computation_->IsWholeMatrix(s)) - entire_submatrix_[computation_->submatrices[s].matrix_index] = s; - for (int32 m = 1; m < num_matrices; m++) - if (entire_submatrix_[m] == -1) - entire_submatrix_[m] = computation_->NewSubMatrix(m, 0, -1, 0, -1); -} - void DerivativeTimeLimiter::ComputeMatrixPruneInfo() { KALDI_ASSERT(computation_->matrix_debug_info.size() == computation_->matrices.size() && @@ -1688,20 +1692,20 @@ void DerivativeTimeLimiter::ModifyCommands() { // desired range are never accessed), and false otherwise. bool DerivativeTimeLimiter::CanLimitMatrix(const Analyzer &analyzer, int32 m) const { - int32 s_entire = entire_submatrix_[m]; // submatrix consisting of + int32 s_whole = whole_submatrices_[m]; // submatrix consisting of // all of the matrix. - int32 s_mapped = submatrix_map_[s_entire]; // the matrix limited in time. - KALDI_ASSERT(s_mapped != 0 && s_mapped != s_entire); - std::vector entire_variables, mapped_variables; - analyzer.variables.AppendVariablesForSubmatrix(s_entire, - &entire_variables); + int32 s_mapped = submatrix_map_[s_whole]; // the matrix limited in time. + KALDI_ASSERT(s_mapped != 0 && s_mapped != s_whole); + std::vector whole_variables, mapped_variables; + analyzer.variables.AppendVariablesForSubmatrix(s_whole, + &whole_variables); analyzer.variables.AppendVariablesForSubmatrix(s_mapped, &mapped_variables); - KALDI_ASSERT(entire_variables.size() > mapped_variables.size()); - std::vector excluded_variables(entire_variables.size() - + KALDI_ASSERT(whole_variables.size() > mapped_variables.size()); + std::vector excluded_variables(whole_variables.size() - mapped_variables.size()); std::vector::iterator end_iter = - std::set_difference(entire_variables.begin(), entire_variables.end(), + std::set_difference(whole_variables.begin(), whole_variables.end(), mapped_variables.begin(), mapped_variables.end(), excluded_variables.begin()); KALDI_ASSERT(end_iter == excluded_variables.end()); @@ -1750,15 +1754,24 @@ void DerivativeTimeLimiter::LimitMatrices(const std::vector &will_limit) { // rows to the left. submat_info.row_offset = new_row_begin; } else { - // This submatrix is not entirely the kept range of the matrix. - // We assume that this submatrix is never accessed directly (as when - // we modified the computation we ensured this). We - // give it a valid but stupid size of num-rows=1, num-cols=1, so - // that if it ever does get accessed it should produce an error. - submat_info.row_offset = 0; - submat_info.num_rows = 1; - submat_info.col_offset = 0; - submat_info.num_cols = 1; + // This submatrix is not entirely inside the kept range of the matrix. + // We assume that this submatrix is never accessed directly except (if + // it was the whole matrix) for in allocation and deallocation commands, + // since when we modified the computation we ensured this. 
+ if (computation_->IsWholeMatrix(s)) { + // If it was the whole matrix then it may be used in allocation and + // deallocation commands, so we should modify it to be the whole of the + // new matrix, which will have fewer rows than before. + submat_info.num_rows = matrix_num_rows; + } else { + // We believe this matrix should never be used. We give it a valid + // but stupid size of num-rows=1, num-cols=1, so that if it ever does + // get accessed it should produce an error. + submat_info.row_offset = 0; + submat_info.num_rows = 1; + submat_info.col_offset = 0; + submat_info.num_cols = 1; + } } } } @@ -1785,7 +1798,7 @@ void DerivativeTimeLimiter::LimitMatrices(const std::vector &will_limit) { void DerivativeTimeLimiter::PruneMatrices() { Analyzer analyzer; analyzer.Init(nnet_, *computation_); - KALDI_ASSERT(computation_->matrices.size() == entire_submatrix_.size()); + KALDI_ASSERT(computation_->matrices.size() == whole_submatrices_.size()); int32 num_matrices = computation_->matrices.size(); std::vector will_limit(num_matrices, false); bool will_limit_at_least_one = false; @@ -1830,22 +1843,6 @@ void DerivativeTimeLimiter::PruneMatrices() { } -int32 MaxOutputTimeInRequest(const ComputationRequest &request) { - int32 ans = std::numeric_limits::min(); - for (size_t i = 0; i < request.outputs.size(); i++) { - std::vector indexes &indexes = request.outputs[i].indexes; - std::vector indexes::const_iterator iter = indexes.begin(), - end = indexes.end(); - for (; iter != end; ++iter) - if (iter.t > ans) - ans = iter.t; - } - if (ans == std::numeric_limits::min()) { - KALDI_ERR << "Failed to find any output indexes in computation request."; - } - return ans; -} - void LimitDerivativeTimes(const Nnet &nnet, int32 min_deriv_time, int32 max_deriv_time, @@ -1855,5 +1852,1812 @@ void LimitDerivativeTimes(const Nnet &nnet, limiter.LimitDerivTimes(); } + +/* + This helper function, used in ReplaceRowWithMatrixOps, detects + when the vector 'indexes' has a 'special structure'. The special structure + is: + zero or more -1's, then + a consecutive nonempty sequence of nonnegative numbers, e.g. 6 7 8 9 10, then + zero or more -1's. + + Note: this function assumes that any negative elements of 'indexes' are -1. + If there are elements less than -1, then it is an error, but this function + does not thoroughly check for that. 'indexes' is required to be a nonempty + vector. + + If 'indexes' has the special structure then this function returns true + and sets the following values, which will explain with the following + example in mind: 'indexes = [ -1, -1, 5 6 7 8, -1 ]'. + - '*first_nonnegative_pos' is set to the number of initial -1's (and also + the location of the first nonnegative element): 2 in this case. + - '*first_nonnegative_value' is set to the value of the first nonnegative + element (5 in this case) + - '*num_nonnegative_values' is set to the number of nonnegative values in + the sequence (4 in this case). + If 'indexes' does not have this special structure, then this function returns + false, and the values of '*first_nonnegative_pos', + '*first_nonnegative_value' and '*num_nonnegative_indexes' on exit are + undefined. +*/ +static bool IndexesHaveSpecialStructure(const std::vector &indexes, + int32 *first_nonnegative_pos, + int32 *first_nonnegative_value, + int32 *num_nonnegative_indexes) { + KALDI_ASSERT(!indexes.empty()); + const int32 *indexes_ptr = &(indexes[0]); + size_t pos = 0, size = indexes.size(); + + // Find the first nonnegative element of 'indexes'. 
+ for (; pos < size; ++pos) + if (indexes_ptr[pos] >= 0) + break; + if (pos == size) + return false; // all -1's... should not happen, but not our problem. + *first_nonnegative_pos = static_cast(pos); + int32 n = indexes_ptr[pos]; + *first_nonnegative_value = n; + // Find the first element after '*first_nonnegative_index' that isn't + // consecutive. + for (; pos < size; ++pos,++n) + if (indexes_ptr[pos] != n) + break; + + *num_nonnegative_indexes = n - *first_nonnegative_value; + + // Check that the remaining values are all <0 (assumed equal to -1, but + // checking <0 may be faster as just one instruction). + for (; pos < size; ++pos) + if (indexes_ptr[pos] >= 0) + return false; // does not have the special structure. + + return true; +} + + + +bool ReplaceRowWithMatrixOps(NnetComputation *computation) { + bool ans = false; + int32 num_commands = computation->commands.size(), + num_indexes = computation->indexes.size(); + for (int32 command_index = 0; command_index < num_commands; + command_index++) { + // non-const because we'll be changing it. + NnetComputation::Command &c = computation->commands[command_index]; + + int32 first_nonnegative_pos, + first_nonnegative_value, + num_nonnegative_indexes; + switch (c.command_type) { + case kCopyRows: case kAddRows: { + int32 indexes_index = c.arg3; + KALDI_ASSERT(indexes_index < num_indexes); + const std::vector &indexes = computation->indexes[indexes_index]; + if (IndexesHaveSpecialStructure(indexes, + &first_nonnegative_pos, + &first_nonnegative_value, + &num_nonnegative_indexes)) { + ans = true; + c.arg1 = computation->NewSubMatrix(c.arg1, first_nonnegative_pos, + num_nonnegative_indexes, + 0, -1); + c.arg2 = computation->NewSubMatrix(c.arg2, first_nonnegative_value, + num_nonnegative_indexes, + 0, -1); + c.command_type = (c.command_type == kCopyRows ? kMatrixCopy : + kMatrixAdd); + } + break; + } + default: + break; + } + } + return ans; +} + + + +/* + This function, used in SnipSingleRowOp(), + finds the number of leading, and trailing, negative numbers + in a vector of integers. For instance, if vec is + [ -1 -1 2 3 -1 4 5 -1 ] + then '*num_leading_negatives' will be set to 2 and '*num_trailing_negatives' + will be set to 1. If all the numbers in 'vec' are all negative, or 'vec' is + empty, it is an error and this function will invoke KALDI_ERR. +*/ +static void FindNumLeadingAndTrailingNegatives(const std::vector &vec, + int32 *num_leading_negatives, + int32 *num_trailing_negatives) { + KALDI_ASSERT(!vec.empty()); + const int32 *begin = &(vec[0]), *ptr = begin, *end = ptr + vec.size(); + while (ptr != end && *ptr < 0) + ptr++; + // note regarding error message: we assume all negative numbers are -1, due to + // the way this is called, but it only affects how we describe the error. + KALDI_ASSERT(ptr != end && "Vector consists entirely of -1's."); + *num_leading_negatives = ptr - begin; + const int32 *ptr2 = end - 1; + // the following while loop should terminate before falling off the vector, + // because we've established above (in the assertion) that the vector contains + // at least one nonnegative number. + while (*ptr2 < 0) + ptr2--; + KALDI_ASSERT(ptr2 != begin); // would be code error. + *num_trailing_negatives = end - 1 - ptr2; +} + +// This function, called from SnipRowOps, is called when it encounters commands +// of type kCopyRows or kAddRows; it modifies such commands when the indexes +// have leading or trailing -1's,h, to make them operate on a smaller submatrix. 
+// It returns true if it made a change, and false otherwise. +static bool SnipSingleRowOp(NnetComputation *computation, + int32 command_index) { + NnetComputation::Command &c = computation->commands[command_index]; + KALDI_ASSERT(static_cast(c.arg3) < computation->indexes.size()); + const std::vector &indexes = computation->indexes[c.arg3]; + int32 num_leading_negatives, num_trailing_negatives; + FindNumLeadingAndTrailingNegatives(indexes, + &num_leading_negatives, + &num_trailing_negatives); + if (num_leading_negatives == 0 && num_trailing_negatives == 0) + return false; + + int32 new_num_rows = static_cast(indexes.size()) - + num_leading_negatives - num_trailing_negatives; + KALDI_ASSERT(new_num_rows > 0); + std::vector new_indexes(indexes.begin() + num_leading_negatives, + indexes.begin() + num_leading_negatives + + new_num_rows); + c.arg3 = computation->indexes.size(); + computation->indexes.push_back(std::vector()); + computation->indexes.back().swap(new_indexes); + c.arg1 = computation->NewSubMatrix(c.arg1, + num_leading_negatives, new_num_rows, + 0, -1); + return true; // made a change. +} + + + +/* + This function, used in SnipSingleRowOp(), finds the number of leading, and + trailing, negative values in a vector of pairs of integers. In particular, + it finds the number of leading and trailing pairs whose .first value is negative + (in practice we'll only encounter either (-1,-1) pairs, or pairs of both + nonnegative values). + + For instance, if vec is + [ (-1,-1) (-1,-1) (80,2) (81,3) (-1,-1) (80,4) (81,5) (-1,-1) ] + then '*num_leading_negatives' will be set to 2 and '*num_trailing_negatives' + will be set to 1. If all the .first numbers in 'vec' are all negative, or + 'vec' is empty, it is an error and this function will invoke KALDI_ERR. +*/ +static void FindNumLeadingAndTrailingNegatives( + const std::vector > &vec, + int32 *num_leading_negatives, + int32 *num_trailing_negatives) { + KALDI_ASSERT(!vec.empty()); + const std::pair *begin = &(vec[0]), *ptr = begin, + *end = ptr + vec.size(); + while (ptr != end && ptr->first < 0) + ptr++; + // note regarding error message: we assume all negative numbers are -1, due to + // the way this is called, but it only affects how we describe the error. + KALDI_ASSERT(ptr != end && "Vector consists entirely of -1's."); + *num_leading_negatives = ptr - begin; + const std::pair *ptr2 = end - 1; + // the following while loop should terminate before falling off the vector, + // because we've established above (in the assertion) that the vector contains + // at least one nonnegative number. + while (ptr2->first < 0) + ptr2--; + KALDI_ASSERT(ptr2 != begin); // would be code error. + *num_trailing_negatives = end - 1 - ptr2; +} + + +// This function, called from SnipRowOps, is called when it encounters commands +// of type kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti or kCopyToRowsMulti; +// have leading or trailing (-1,-1) pairs, to make them operate on a smaller +// submatrix. It returns true if it made a change, and false otherwise. 
+static bool SnipMultiRowOp(NnetComputation *computation, + int32 command_index) { + NnetComputation::Command &c = computation->commands[command_index]; + KALDI_ASSERT(static_cast(c.arg2) < computation->indexes_multi.size()); + const std::vector > &indexes_multi = + computation->indexes_multi[c.arg2]; + int32 num_leading_negatives, num_trailing_negatives; + FindNumLeadingAndTrailingNegatives(indexes_multi, + &num_leading_negatives, + &num_trailing_negatives); + if (num_leading_negatives == 0 && num_trailing_negatives == 0) + return false; + + int32 new_num_rows = static_cast(indexes_multi.size()) - + num_leading_negatives - num_trailing_negatives; + KALDI_ASSERT(new_num_rows > 0); + std::vector > new_indexes_multi( + indexes_multi.begin() + num_leading_negatives, + indexes_multi.begin() + num_leading_negatives + new_num_rows); + c.arg2 = computation->indexes_multi.size(); + computation->indexes_multi.push_back(std::vector >()); + computation->indexes_multi.back().swap(new_indexes_multi); + c.arg1 = computation->NewSubMatrix(c.arg1, + num_leading_negatives, new_num_rows, + 0, -1); + return true; // made a change. +} + + + +/* + This function, used in SnipRangeRowOp(), finds the number of leading, and + trailing values in a vector of pairs of integers, that are the same (i.e. + pairs of the form (x, x) for any x. [This is how we represent an empty + range, which is a kind of no-op, in commands of kCopyRowRanges or + kAddRowRanges. + + For instance, if vec is + [ (0,0) (0,0) (4,5) (6,8) (0,0) (10,12) (14,20) (0,0) ] + then '*num_leading_identicals' will be set to 2 and '*num_trailing_identicals' + will be set to 1. If all pairs in 'vec' are identical, or 'vec' is empty, it + is an error and this function will invoke KALDI_ERR. +*/ +static void FindNumLeadingAndTrailingIdenticals( + const std::vector > &vec, + int32 *num_leading_identicals, + int32 *num_trailing_identicals) { + KALDI_ASSERT(!vec.empty()); + const std::pair *begin = &(vec[0]), *ptr = begin, + *end = ptr + vec.size(); + while (ptr != end && ptr->first == ptr->second) + ptr++; + // note regarding error message: we assume all negative numbers are -1, due to + // the way this is called, but it only affects how we describe the error. + KALDI_ASSERT(ptr != end && "Vector consists entirely of -1's."); + *num_leading_identicals = ptr - begin; + const std::pair *ptr2 = end - 1; + // the following while loop should terminate before falling off the vector, + // because we've established above (in the assertion) that the vector contains + // at least one nonnegative number. + while (ptr2->first == ptr2->second) + ptr2--; + KALDI_ASSERT(ptr2 != begin); // would be code error. + *num_trailing_identicals = end - 1 - ptr2; +} + + +// This function, called from SnipRowOps, is called when it encounters commands +// of type kAddRowRanges that have leading or trailing (x, x) pairs [i.e. pairs +// of identical values; these are how we represent empty ranges], to make them +// operate on a smaller submatrix. It returns true if it made a change, and +// false otherwise. 
+static bool SnipRangesRowOp(NnetComputation *computation, + int32 command_index) { + NnetComputation::Command &c = computation->commands[command_index]; + KALDI_ASSERT(static_cast(c.arg3) < computation->indexes_ranges.size()); + const std::vector > &indexes_ranges = + computation->indexes_ranges[c.arg3]; + int32 num_leading_identicals, num_trailing_identicals; + FindNumLeadingAndTrailingIdenticals(indexes_ranges, + &num_leading_identicals, + &num_trailing_identicals); + if (num_leading_identicals == 0 && num_trailing_identicals == 0) + return false; + + int32 new_num_rows = static_cast(indexes_ranges.size()) - + num_leading_identicals - num_trailing_identicals; + KALDI_ASSERT(new_num_rows > 0); + std::vector > new_indexes_ranges( + indexes_ranges.begin() + num_leading_identicals, + indexes_ranges.begin() + num_leading_identicals + new_num_rows); + c.arg3 = computation->indexes_ranges.size(); + computation->indexes_ranges.push_back(std::vector >()); + computation->indexes_ranges.back().swap(new_indexes_ranges); + c.arg1 = computation->NewSubMatrix(c.arg1, + num_leading_identicals, new_num_rows, + 0, -1); + return true; // made a change. +} + + + +bool SnipRowOps(NnetComputation *computation) { + bool ans = false; + int32 num_commands = computation->commands.size(); + for (int32 command_index = 0; command_index < num_commands; + command_index++) { + // non-const because we'll be changing it. + NnetComputation::Command &c = computation->commands[command_index]; + + switch (c.command_type) { + case kCopyRows: case kAddRows: { + if (SnipSingleRowOp(computation, command_index)) + ans = true; + break; + } + case kAddRowsMulti: case kAddToRowsMulti: + case kCopyRowsMulti: case kCopyToRowsMulti: { + if (SnipMultiRowOp(computation, command_index)) + ans = true; + break; + } + case kAddRowRanges: { + if (SnipRangesRowOp(computation, command_index)) + ans = true; + break; + } + default: + break; + } + } + return ans; +} + + + +// This class implements the internals of the ExpandComputation() function (used +// in shortcut compilation); see comment by the declaration of +// ExpandComputation() in nnet-optimize-utils.h for overview. +class ComputationExpander { + public: + ComputationExpander(const Nnet &nnet, + const MiscComputationInfo &misc_info, + const NnetComputation &computation, + bool need_debug_info, + int32 num_n_values, + NnetComputation *expanded_computation): + nnet_(nnet), misc_info_(misc_info), + computation_(computation), + need_debug_info_(need_debug_info), + num_n_values_(num_n_values), + expanded_computation_(expanded_computation) { + KALDI_ASSERT(num_n_values > 2); + } + + // This function call implements the functionality of the class, + // expanding the computation. + void Expand(); + + private: + // This function sets up and computes the 'n_fast_' vector (see comment + // by the declaration of 'n_fast_' for what this is. + void InitFastInfo(); + + // This function sets up the 'matrices' vector in 'expanded_computation_'. + // It's quite simple: it just multiplies all the num-rows by num_n_values_ and + // divides by 2, and leaves the num-cols the same. + void ComputeMatrixInfo(); + + // This function, only called if need_debug_info_ is true, sets up + // the 'matrix_debug_info' vector in 'expanded_computation_'. + void ComputeDebugInfo(); + + // This function sets up the 'submatrices' vector in 'expanded_computation_'. + // Column ranges always stay the same, but for row ranges it's a little + // more complicated. 
+ void ComputeSubmatrixInfo(); + + // Expands a command of type kCopyRows or kAddRows; involves adding a new + // element of 'indexes' to expanded_computation_. + void ExpandRowsCommand(const NnetComputation::Command &c_in, + NnetComputation::Command *c_out); + + // Expands a command of type kCopyRowsMulti or kAddRowsMulti, kCopyToRowsMulti + // or kAddToRowsMulti; involves adding a new element of 'indexes_multi' to + // expanded_computation_. + void ExpandRowsMultiCommand(const NnetComputation::Command &c_in, + NnetComputation::Command *c_out); + + + // Expands a command of type kAddRowRanges; involves adding a new element of + // 'indexes_ranges' to expanded_computation_. + void ExpandRowRangesCommand(const NnetComputation::Command &c_in, + NnetComputation::Command *c_out); + + + // This function computes all the PrecomputedIndexes in the + // 'component_precomputed_indexes' member of 'expanded_computation_'. + // They are all generated from scratch, by using the Component::PrecomputedIndexes() + // member function. The 'input_indexes' and 'output_indexes' arguments are worked + // out from the 'debug_info' [if we're not generating debug_info we specially generate + // it for the specific matrices in question], and the 'need_backprop' + // argument is worked out by seeing whether there is a call to Backprop() with + // the same precomputed-indexes element. + void ComputePrecomputedIndexes(); + + // Computes the 'commands' member of the output. This function also adds as + // needed to 'indexes', 'indexes_multi' and 'indexes_ranges' in the output. + // Later on we can call RenumberComputation() to remove any duplicates that + // might result from this. + void ComputeCommands(); + + + // This command ensure that the debug-info in expanded_computation_ for the + // matrix underlying the submatrix with index 'submatrix_index', exists and is + // set up. In some cases we need the debug info for some matrices in order to + // do the expansion, even if debug info is not requested for the output; in + // those cases we set it up temporarily and clear it before we finish. + void EnsureDebugInfoExists(int32 submatrix_index); + + + + // This function is used in mapping row-indexes into sub-matrices from the + // old to the new computation. It is mostly a wrapper for + // GetNewMatrixLocationInfo, but designed to give row indexes into + // submatrices rather than matrices; see the documentation for + // GetNewMatrixLocationinfo() for details and an explanation of the + // interface. + // This function assumes that ComputeSubmatrixInfo() has already + // been called. + // Note: it returns true if the index 'old_row_index' into submatrix + // indexed 'old_submat_index' corresponds to an Index with n=0; otherwise + // it returns false and does not set the output values. + bool GetNewSubmatLocationInfo(int32 old_submat_index, + int32 old_row_index, + int32 *new_row_index, + int32 *new_n_stride) const; + + + /// This function is used in mapping row-indexes into matrices, from the + /// old to the new computation. + /// @param [in] old_matrix_index The matrix-index > 0, for which we + /// are mapping row-indexes. + /// @param [in] old_row_index The old row-index into the matrix. + /// This MUST be a row-index for which n=0 + /// in the cindexes information. + /// @param [out] new_row_index To '*new_row_index' this funtion outputs + /// the row-index where the cindex referred to in + /// 'old_matrix_index' will reside in the new, + /// expanded computation. 
+ /// @param [out] new_n_stride To '*new_n_stride' this function outputs + /// the 'n stride' in the new computation, which + /// means the amount the row-index increases + /// every time we increase the 'n' value in the + /// cindex by one. + void GetNewMatrixLocationInfo(int32 old_matrix_index, + int32 old_row_index, + int32 *new_row_index, + int32 *new_n_stride) const; + + + + // This function 'expands' a set of indexes; it's called from + // ComputePrecomputedIndexes(). The indexes are expected to + // have the normal kind of regularity, with the 'n' varying either + // the fastest or the slowest of any index. + void ExpandIndexes(const std::vector &indexes, + std::vector *indexes_expanded) const; + + + + // This function, used in ExpandIndexes(), works out whether a vector + // of indexes varies 'fast' in n, or slowly; see the comment for + // ComputationIsDecomposable() in nnet-optimize-utils.h for more explanation + // of the meaning. + // If the vector of indexes does not have the required regular structure w.r.t + // n, this function will throw an exception via KALDI_ERR. + bool GetFastInfo(const std::vector &indexes) const; + + /// This function is analogous to GetNewMatrixLocationInfo, but + /// specialized for the case where you have a vector of Indexes + /// It's used inside ExpandIndexes(). + /// + /// @param [in] 'is_fast' should be true if the 'n' varies fast in the input + /// indexes (i.e. n stride is 1)... + /// @param [in] old_index The index into 'indexes'.. should point to an + /// element with n==0 (note, the element is an Index; + /// and note the capital I, it affects the meaning). + /// @param [out] new_index The index into the expanded indexes vector + /// that this same Index will be located at in the + /// expanded computation. + /// @param [out] new_n_stride The stride of n, i.e. the amount by which the + /// index changes when we increment n by one in the + /// Index. This will actually be the same as in + /// the old computation. + void GetNewLocationInfo(const std::vector &indexes, + bool is_fast, + int32 old_index, + int32 *new_index, + int32 *new_n_stride) const; + + + // This 'n_fast_' vector is indexed by the matrix-index in the computation, + // i.e. the same index as indexes computation_.matrix_info and + // expanded_computation_->matrix_info. For each matrix-index m > 0 it + // contains true if the 'n' varies 'fast', or false if the 'n' index varies + // 'slowly'. By 'fast' and 'slow', we mean in the same sense as is desribed + // in the comment for ComputationIsDecomposable() in nnet-optimize-utils.h. + std::vector n_fast_; + + const Nnet &nnet_; + const MiscComputationInfo &misc_info_; + const NnetComputation &computation_; + bool need_debug_info_; + int32 num_n_values_; + NnetComputation *expanded_computation_; +}; + + + +void ComputationExpander::ExpandRowsCommand( + const NnetComputation::Command &c_in, + NnetComputation::Command *c_out) { + // we need to expand the row-indexes in c_in.arg3, and put the index of the + // resulting vector in expanded_computation_->indexes, in 'c_out->arg3'. + + int32 s1 = c_in.arg1, s2 = c_in.arg2; + + // The command that gets called is something like + // submat1.AddRows(submat2, indexes) if submat1 is the submatrix referred to in + // 's1' and submat2 is the submatrix referred to in 's2'. + // 'indexes' has the same size as the num-rows of submat1, and the values + // in the vector are row-indexes into s2. 
+ const std::vector &old_indexes = computation_.indexes[c_in.arg3]; + c_out->arg3 = expanded_computation_->indexes.size(); + expanded_computation_->indexes.push_back(std::vector()); + std::vector &new_indexes = expanded_computation_->indexes.back(); + + int32 old_size = old_indexes.size(), + num_n_values = num_n_values_, + new_size = expanded_computation_->submatrices[s1].num_rows; + KALDI_ASSERT(old_size % 2 == 0 && + old_size == computation_.submatrices[s1].num_rows); + new_indexes.resize(new_size, -1); + + for (int32 i1 = 0; i1 < old_size; i1++) { + int32 new_i1_n0, new_n_stride1; + if (GetNewSubmatLocationInfo(s1, i1, &new_i1_n0, &new_n_stride1)) { + // GetNewSubmatLocationInfo() returns true if this corresponds to + // a Cindex with n == 0. + int32 i2 = old_indexes[i1]; + int32 new_i2_n0, new_n_stride2; + if (i2 < 0) { // if i2 is -1, we'll just fill any relevant positions in + // 'new_indexes' with -1's. + continue; + } else { + bool ans = GetNewSubmatLocationInfo(s2, i2, &new_i2_n0, &new_n_stride2); + KALDI_ASSERT(ans); // source should also be for n==0, because we don't + // (or at least shouldn't) create computations that + // mix up the 'n' values + for (int32 n = 0; n < num_n_values; n++) { + int32 new_i1 = new_i1_n0 + n * new_n_stride1, + new_i2 = new_i2_n0 + new_n_stride2; + new_indexes[new_i1] = new_i2; + } + } + } + } +} + +void ComputationExpander::ExpandRowsMultiCommand( + const NnetComputation::Command &c_in, + NnetComputation::Command *c_out) { + // we need to expand the (submatrix,row)-index pairs in c_in.arg2, and put the + // index of the resulting vector in expanded_computation_->indexes_multi, + // in 'c_out->arg2'. + + int32 s1 = c_in.arg1, + num_rows_old = computation_.submatrices[s1].num_rows, + num_rows_new = expanded_computation_->submatrices[s1].num_rows; + + const std::vector > &old_indexes_multi = + computation_.indexes_multi[c_in.arg2]; + // old_indexes_multi is a vector that has the same size as the num-rows + // of submatrix s1. It contains pairs that are either (-1, -1), or + // pairs (submatrix-index, row-index) referring to other submatrices + // in the computation. + + KALDI_ASSERT(static_cast(old_indexes_multi.size()) == num_rows_old); + KALDI_ASSERT(num_rows_old % 2 == 0); + int32 num_n_values = num_n_values_; + + + c_out->arg2 = expanded_computation_->indexes_multi.size(); + expanded_computation_->indexes_multi.push_back( + std::vector >()); + std::vector > &new_indexes_multi = + expanded_computation_->indexes_multi.back(); + + new_indexes_multi.resize(num_rows_new, + std::pair(-1, -1)); + + for (int32 i1 = 0; i1 < num_rows_old; i1++) { + int32 new_i1_n0, new_n_stride1; + if (GetNewSubmatLocationInfo(s1, i1, &new_i1_n0, &new_n_stride1)) { + // GetNewSubmatLocationInfo() returns true if this corresponds to + // a Cindex with n == 0. + int32 s2 = old_indexes_multi[i1].first, + i2 = old_indexes_multi[i1].second; + int32 new_i2_n0, new_n_stride2; + if (s2 < 0) { // if s2 is -1, we don't have to do anything... we'd have + // to fill any relevant positions in 'new_indexes_multi' + // with (-1,-1)'s, but it's filled with that by default. 
+ continue; + } else { + bool ans = GetNewSubmatLocationInfo(s2, i2, &new_i2_n0, &new_n_stride2); + KALDI_ASSERT(ans); // source should also be for n==0, because we don't + // (or at least shouldn't) create computations that + // mix up the 'n' values + + int32 new_i1 = new_i1_n0, new_i2 = new_i2_n0; + + for (int32 n = 0; n < num_n_values; + n++, new_i1 += new_n_stride1, new_i2 += new_n_stride2) { + new_indexes_multi[new_i1].first = s2; + new_indexes_multi[new_i1].second = new_i2; + } + } + } + } +} + + + +void ComputationExpander::ExpandRowRangesCommand( + const NnetComputation::Command &c_in, + NnetComputation::Command *c_out) { + // we need to expand the pairs of row-indexes in c_in.arg2, and put the index + // of the resulting vector in expanded_computation_->indexes_ranges, in + // 'c_out->arg2'. + + int32 s1 = c_in.arg1, s2 = c_in.arg2, + num_rows_old = computation_.submatrices[s1].num_rows, + num_rows_new = expanded_computation_->submatrices[s1].num_rows; + KALDI_ASSERT(static_cast(c_in.arg3) < + computation_.indexes_ranges.size()); + const std::vector > &old_indexes_ranges = + computation_.indexes_ranges[c_in.arg3]; + // old_indexes_ranges is a vector that has the same size as the num-rows of + // submatrix s1. It contains pairs that are either two copies of the same + // value (in practice the pair (-1, -1)), or pairs (begin-row-index, + // end-row-index) representing the (begin,end) of a range in submatrix s2. + // Note: end-row-index is one past the end of the range, as for C++ iterators. + + KALDI_ASSERT(static_cast(old_indexes_ranges.size()) == num_rows_old); + KALDI_ASSERT(num_rows_old % 2 == 0); + int32 num_n_values = num_n_values_; + + c_out->arg3 = expanded_computation_->indexes_ranges.size(); + expanded_computation_->indexes_ranges.push_back( + std::vector >()); + std::vector > &new_indexes_ranges = + expanded_computation_->indexes_ranges.back(); + + new_indexes_ranges.resize(num_rows_new, + std::pair(-1, -1)); + + for (int32 i1 = 0; i1 < num_rows_old; i1++) { + int32 new_i1_n0, new_n_stride1; + if (GetNewSubmatLocationInfo(s1, i1, &new_i1_n0, &new_n_stride1)) { + // GetNewSubmatLocationInfo() returns true if this corresponds to + // a Cindex with n == 0. + int32 i2_begin = old_indexes_ranges[i1].first, + i2_end = old_indexes_ranges[i1].second; + if (i2_end == i2_begin) + continue; // (-1, -1) pair, meaning an empty range. + // 'new_indexes_ranges' is filled with (-1, -1) pairs as a + // default so we don't have to do anything for these + // elements. + int32 i2_last = i2_end - 1; + int32 new_i2_n0_begin, new_i2_n0_last, + new_n_stride2; // only 1 stride variable; both calls will output + // the same value. 
+ + bool ans1 = GetNewSubmatLocationInfo(s2, i2_begin, &new_i2_n0_begin, + &new_n_stride2), + ans2 = GetNewSubmatLocationInfo(s2, i2_last, &new_i2_n0_last, + &new_n_stride2); + KALDI_ASSERT(ans1 && ans2 && new_i2_n0_last >= new_i2_n0_begin && + new_i2_n0_begin >= 0); + // source should also be for n==0, because we don't (or at least + // shouldn't) create computations that mix up the 'n' values + + + int32 new_i1 = new_i1_n0, + new_i2_begin = new_i2_n0_begin, + new_i2_end = new_i2_n0_last + 1; + for (int32 n = 0; n < num_n_values; + n++, new_i1 += new_n_stride1, new_i2_begin += new_n_stride2, + new_i2_end += new_n_stride2) { + new_indexes_ranges[new_i1].first = new_i2_begin; + new_indexes_ranges[new_i1].second = new_i2_end; + } + } + } +} + + + +void ComputationExpander::ComputeCommands() { + int32 num_commands = computation_.commands.size(); + expanded_computation_->commands.resize(num_commands); + for (int32 command_index = 0; command_index < num_commands; + command_index++) { + const NnetComputation::Command &c = computation_.commands[command_index]; + NnetComputation::Command &c_out = + expanded_computation_->commands[command_index]; + c_out = c; + // Commands that only operate on submatrices, components and + // precomputed-indexes do not have to be changed because we'll take care of + // the expansion by suitably redefining the matrices and submatrices, and + // recreating the precomputed-indexes. + // However, commands that require, 'indexes', 'indexes_multi' or + // 'indexes_ranges' do need to be modified. + switch (c.command_type) { + case kAllocMatrixUndefined: case kAllocMatrixZeroed: + case kDeallocMatrix: case kAllocMatrixFromOther: + case kAllocMatrixFromOtherZeroed: + case kPropagate: case kStoreStats: case kBackprop: + case kBackpropNoModelUpdate: case kMatrixCopy: case kMatrixAdd: + break; + case kCopyRows: case kAddRows: + ExpandRowsCommand(c, &c_out); + break; + case kCopyRowsMulti: case kAddRowsMulti: + case kCopyToRowsMulti: case kAddToRowsMulti: + ExpandRowsMultiCommand(c, &c_out); + break; + case kAddRowRanges: + ExpandRowRangesCommand(c, &c_out); + break; + case kAcceptInput: case kProvideOutput: case kNoOperation: + case kNoOperationMarker: case kNoOperationLabel: case kGotoLabel: + break; + default: + KALDI_ERR << "Un-handled command type"; + } + } +} + + + + +void ComputationExpander::InitFastInfo() { + // note: the zeroth matrix is not a real matrix, it's the empty matrix. + int32 num_matrices = computation_.matrices.size(); + n_fast_.resize(num_matrices); + + // the input computation to class ComputationExpander is required to + // have its debug info set up. + KALDI_ASSERT(!computation_.matrix_debug_info.empty()); + for (int32 m = 1; m < num_matrices; m++) { + int32 num_rows = computation_.matrices[m].num_rows; + // num-rows should be a multiple of 2 because we assume the input computation + // was built for 2 n-values, and has a symmetry where it's doing the same + // computation for each n values. + KALDI_ASSERT(num_rows % 2 == 0); + const NnetComputation::MatrixDebugInfo &debug_info = computation_.matrix_debug_info[m]; + KALDI_ASSERT(debug_info.cindexes.size() == num_rows); + // We require that the 'n' values be in order, which implies that the first + // 'n' value be zero. 
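+    // For illustration only (hypothetical cindexes, not part of the original
+    // patch): with num_rows == 6, a 'fast' ordering of the n values looks like
+    //   n: 0 1 0 1 0 1   (n_stride == 1),
+    // while a 'slow' ordering looks like
+    //   n: 0 0 0 1 1 1   (n_stride == num_rows / 2 == 3).
+    // Either way the first n value is 0, which is what the assert below checks.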
+ KALDI_ASSERT(debug_info.cindexes[0].second.n == 0); + bool is_fast = (debug_info.cindexes[1].second.n == 1); + + bool do_check = (RandInt(0, 2) == 0); + if (do_check) { + // n_stride is the expected difference in row-index between successive + // values of 'n' for otherwise identical cindexes. + int32 n_stride = (is_fast ? 1 : num_rows / 2); + // 'increment' would be 1 if we were checking everything; we do a partial + // check, for speed. + int32 increment = RandInt(1, 10); + for (int32 i = 0; i + n_stride < num_rows; i += increment) { + const Cindex &this_cindex = debug_info.cindexes[i], + &next_cindex = debug_info.cindexes[i + n_stride]; + if (this_cindex.second.n == 0) { + if (!(next_cindex.first == this_cindex.first && + next_cindex.second.n == 1 && + next_cindex.second.t == this_cindex.second.t && + next_cindex.second.x == this_cindex.second.x)) { + KALDI_ERR << "Problem encountered in 'shortcut' compilation: the computation " + << "does not have the expected structure. Try compiling with " + << "--use-shortcut=false."; + } + } + } + } + } +} + + +bool ComputationExpander::GetFastInfo(const std::vector &indexes) const { + KALDI_ASSERT(!indexes.empty()); + int32 num_rows = indexes.size(); + // num-rows should be a multiple of 2 because we assume the input computation + // was built for 2 n-values, and has a symmetry where it's doing the same + // computation for each n values. + KALDI_ASSERT(num_rows % 2 == 0); + + KALDI_ASSERT(indexes[0].n == 0); + bool is_fast = (indexes[1].n == 1); + bool do_check = (RandInt(0, 1) == 0); + + if (do_check) { + // n_stride is the expected difference in row-index between successive + // values of 'n' for otherwise identical cindexes. + int32 n_stride = (is_fast ? 1 : num_rows / 2); + // 'increment' would be 1 if we were checking everything; we do a partial + // check, for speed. + int32 increment = RandInt(1, 5); + for (int32 i = 0; i + n_stride < num_rows; i += increment) { + const Index &this_index = indexes[i], &next_index = indexes[i + n_stride]; + if (this_index.n == 0) { + if (!(next_index.n == 1 && next_index.t == this_index.t && + next_index.x == this_index.x)) { + KALDI_ERR << "Problem encountered in 'shortcut' compilation: the computation " + << "does not have the expected structure. Try compiling with " + << "--use-shortcut=false."; + } + } + } + } + return is_fast; +} + + +void ComputationExpander::Expand() { + InitFastInfo(); + ComputeMatrixInfo(); + if (need_debug_info_) + ComputeDebugInfo(); + else + expanded_computation_->matrix_debug_info.clear(); + ComputeSubmatrixInfo(); + ComputePrecomputedIndexes(); + ComputeCommands(); + + expanded_computation_->need_model_derivative = + computation_.need_model_derivative; +} + +void ComputationExpander::ComputeMatrixInfo() { + int32 num_matrices = computation_.matrices.size(); + expanded_computation_->matrices.resize(num_matrices); + // Matrix zero is a special case; it's the empty matrix. + expanded_computation_->matrices[0] = computation_.matrices[0]; + for (int32 m = 1; m < num_matrices; m++) { + expanded_computation_->matrices[m] = computation_.matrices[m]; + expanded_computation_->matrices[m].num_rows = + (computation_.matrices[m].num_rows / 2) * num_n_values_; + } +} + +void ComputationExpander::ComputeDebugInfo() { + int32 num_matrices = computation_.matrices.size(); + KALDI_ASSERT(computation_.matrix_debug_info.size() == num_matrices); + expanded_computation_->matrix_debug_info.resize(num_matrices); + // Matrix zero is a special case; it's the empty matrix. 
+ expanded_computation_->matrix_debug_info[0] = + computation_.matrix_debug_info[0]; + int32 num_n_values = num_n_values_; + for (int32 m = 1; m < num_matrices; m++) { + const NnetComputation::MatrixDebugInfo &info_in = + computation_.matrix_debug_info[m]; + NnetComputation::MatrixDebugInfo &info_out = + expanded_computation_->matrix_debug_info[m]; + info_out.is_deriv = info_in.is_deriv; + int32 num_rows_in = computation_.matrices[m].num_rows, + num_rows_out = expanded_computation_->matrices[m].num_rows; + KALDI_ASSERT(num_rows_in == info_in.cindexes.size()); + info_out.cindexes.resize(num_rows_out); + const Cindex *cindexes_in = &(info_in.cindexes[0]); + Cindex *cindexes_out = &(info_out.cindexes[0]); + for (int32 r = 0; r < num_rows_in; r++) { + if (info_in.cindexes[r].second.n == 0) { + int32 new_r, new_n_stride; + GetNewMatrixLocationInfo(m, r, &new_r, &new_n_stride); + for (int32 n = 0; n < num_n_values; n++) { + int32 r_out = new_r + n * new_n_stride; + cindexes_out[r_out] = cindexes_in[r]; + cindexes_out[r_out].second.n = n; + } + } + } + } +} + +void ComputationExpander::ComputeSubmatrixInfo() { + int32 num_submatrices = computation_.submatrices.size(); + expanded_computation_->submatrices.resize(num_submatrices); + // Sub-matrix zero is a special case; it's the empty submatrix. + expanded_computation_->submatrices[0] = computation_.submatrices[0]; + for (int32 s = 1; s < num_submatrices; s++) { + const NnetComputation::SubMatrixInfo &info_in = computation_.submatrices[s]; + int32 m = info_in.matrix_index; + const NnetComputation::MatrixDebugInfo &debug_info_in = + computation_.matrix_debug_info[m]; + + + int32 old_n_stride = + (n_fast_[m] ? 1 : computation_.matrices[m].num_rows / 2); + + // we may need to change the row_offset and num_rows. + int32 first_row_in = info_in.row_offset, + last_row_in = first_row_in + info_in.num_rows - 1, + last_row_in_n0 = last_row_in - old_n_stride; + KALDI_ASSERT(debug_info_in.cindexes[first_row_in].second.n == 0 && + debug_info_in.cindexes[last_row_in].second.n == 1 && + debug_info_in.cindexes[last_row_in_n0].second.n == 0); + // the function GetNewMatrixLocationInfo() only works for rows that + // correspond to n == 0, so we work out a location that's otherwise similar + // to the last row but has n == 0, get the 'new' location for that, and + // convert to n == (num_n_values_ - 1). + int32 first_row_out, last_row_out_n0, new_n_stride; + GetNewMatrixLocationInfo(m, first_row_in, + &first_row_out, &new_n_stride); + GetNewMatrixLocationInfo(m, last_row_in_n0, + &last_row_out_n0, &new_n_stride); + int32 last_row_out = last_row_out_n0 + (new_n_stride * (num_n_values_ - 1)), + new_num_rows = (last_row_out + 1 - first_row_out); + KALDI_ASSERT(new_num_rows >= info_in.num_rows); + + NnetComputation::SubMatrixInfo &info_out = + expanded_computation_->submatrices[s]; + info_out.matrix_index = m; + info_out.row_offset = first_row_out; + info_out.num_rows = new_num_rows; + info_out.col_offset = info_in.col_offset; + info_out.num_cols = info_in.num_cols; + } +} + +void ComputationExpander::ComputePrecomputedIndexes() { + // for each element of 'component_precomputed_indexes', + // we will try to work out the command-index of the associated + // Propagate() command and of the associated Backprop() command, + // if it exists. + // We expect that each such element will be associated with + // exactly one Propagate() command and at most one Backprop() command. 
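+  // A small hypothetical trace (made-up command numbers, not from the original
+  // patch): if command 12 is kPropagate with arg1 == 5 (the component index)
+  // and arg2 == 3 (the precomputed-indexes slot), the loop below sets
+  // component_index[3] = 5; if command 40 is kBackprop with arg2 == 3, it also
+  // sets need_backprop[3] = true.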
+ int32 num_commands = computation_.commands.size(), + num_precomputed_indexes = computation_.component_precomputed_indexes.size(); + + if (num_precomputed_indexes == 1) + return; // Nothing to compute. Note: element zero of + // component_precomputed_indexes is reserved for NULL. + + std::vector need_backprop(num_precomputed_indexes, false); + + std::vector component_index(num_precomputed_indexes, -1); + + for (int32 command_index = 0; command_index < num_commands; command_index++) { + const NnetComputation::Command &c = computation_.commands[command_index]; + + if (c.command_type == kPropagate && c.arg2 > 0) { + KALDI_ASSERT(c.arg2 < num_precomputed_indexes); + component_index[c.arg2] = c.arg1; + } + if ((c.command_type == kBackprop || + c.command_type == kBackpropNoModelUpdate) && c.arg2 > 0) { + KALDI_ASSERT(c.arg2 < num_precomputed_indexes); + need_backprop[c.arg2] = true; + } + } + + for (size_t p = 1; + p < expanded_computation_->component_precomputed_indexes.size(); + ++p) + delete expanded_computation_->component_precomputed_indexes[p].data; + expanded_computation_->component_precomputed_indexes.clear(); + expanded_computation_->component_precomputed_indexes.resize( + num_precomputed_indexes); + + for (int32 p = 1; p < num_precomputed_indexes; ++p) { + const NnetComputation::PrecomputedIndexesInfo &old_info = + computation_.component_precomputed_indexes[p]; + NnetComputation::PrecomputedIndexesInfo &new_info = + expanded_computation_->component_precomputed_indexes[p]; + KALDI_ASSERT(!old_info.input_indexes.empty() && + !old_info.output_indexes.empty() && + "Input/output indexes not present in precomputed info of " + "computation to be expanded."); + // note: we could place these expanded indexes into 'new_info.input_indexes' + // and 'new_info.output_indexes', but we actually don't need to keep them + // there, because they are only required to be kept in computations where + // the n indexes consist of the set (0, 1), and the computation we're + // creating has more distinct n indexes than that. + std::vector input_indexes, output_indexes; + ExpandIndexes(old_info.input_indexes, &new_info.input_indexes); + ExpandIndexes(old_info.output_indexes, &new_info.output_indexes); + KALDI_ASSERT(component_index[p] >= 0); + const Component *component = nnet_.GetComponent(component_index[p]); + ComponentPrecomputedIndexes *expanded_precomputed_indexes = + component->PrecomputeIndexes(misc_info_, input_indexes, + output_indexes, need_backprop[p]); + // this object should not be null because it was not NULL the + // last time we generated it from the same component, for the + // same computation. 
+ KALDI_ASSERT(expanded_precomputed_indexes != NULL); + new_info.data = expanded_precomputed_indexes; + } +} + + +bool ComputationExpander::GetNewSubmatLocationInfo( + int32 old_submat_index, int32 old_row_index, + int32 *new_row_index, int32 *new_n_stride) const { + int32 matrix_index = computation_.submatrices[old_submat_index].matrix_index, + row_offset = computation_.submatrices[old_submat_index].row_offset; + + const NnetComputation::MatrixDebugInfo &debug_info_in = + computation_.matrix_debug_info[matrix_index]; + if (debug_info_in.cindexes[old_row_index + row_offset].second.n != 0) + return false; + GetNewMatrixLocationInfo(matrix_index, old_row_index + row_offset, + new_row_index, new_n_stride); + *new_row_index -= row_offset; + return true; +} + +void ComputationExpander::GetNewMatrixLocationInfo( + int32 old_matrix_index, int32 old_row_index, + int32 *new_row_index, int32 *new_n_stride) const { + bool n_is_fast = n_fast_[old_matrix_index]; + int32 num_rows = computation_.matrices[old_matrix_index].num_rows; + int32 n_stride; + if (n_is_fast) { + n_stride = 1; + // If the n index varies fast for this matrix, then the old row-index + // should be a multiple of 2 because: + // - we assume that the input computation was built for 2 n-values + // - if n varies fast then the cindexes for this matrix in the input + // computation would come in pairs, for n=(0,1) + // - the cindex that 'old_row_index' represents must be for n=0 + // (this is a requirement of this function) + KALDI_ASSERT(old_row_index % 2 == 0); + *new_n_stride = 1; + // the row-index of the element in question with n=0 will get larger if n + // varies 'fast', because each block of elements with a certain (x,t) value + // grows in size by a factor of num_n_values_ / 2.0. + *new_row_index = (old_row_index / 2) * num_n_values_; + } else { + // n varies more slowly, the cindexes are in blocks where the + // first block has n=0, the second has n=1, and so on. + // Because we assume that the cindex that lives in this location + // has n == 0, its position does not change (so new_row_index == + // old_row_index). 
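+    // Hypothetical example (not part of the original patch): an old matrix
+    // with num_rows == 10 stores rows 0-4 for n == 0 and rows 5-9 for n == 1;
+    // with num_n_values_ == 4, old row 3 stays at new row 3, and its copies
+    // for n == 1, 2, 3 land at rows 8, 13 and 18 (stride num_rows / 2 == 5).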
+ *new_row_index = old_row_index; + *new_n_stride = (num_rows / 2); + } +} + + +void ComputationExpander::ExpandIndexes( + const std::vector &indexes, + std::vector *indexes_expanded) const { + bool is_fast = GetFastInfo(indexes); + int32 num_n_values = num_n_values_, + old_size = indexes.size(), + new_size = (old_size / 2) * num_n_values; + indexes_expanded->resize(new_size); + Index *indexes_expanded_ptr = &((*indexes_expanded)[0]); + for (int32 i = 0; i < old_size; i++) { + if (indexes[i].n == 0) { + int32 new_i_n0, new_n_stride; + int32 t = indexes[i].t, x = indexes[i].x; + GetNewLocationInfo(indexes, is_fast, i, &new_i_n0, &new_n_stride); + for (int32 n = 0; n < num_n_values; n++) { + int32 new_i = new_i_n0 + (n * new_n_stride); + KALDI_ASSERT(new_i < new_size); + indexes_expanded_ptr[new_i].n = n; + indexes_expanded_ptr[new_i].t = t; + indexes_expanded_ptr[new_i].x = x; + } + } + } +} + + +void ComputationExpander::GetNewLocationInfo( + const std::vector &indexes, bool is_fast, + int32 old_index, int32 *new_index, int32 *new_n_stride) const { + int32 num_indexes = indexes.size(); + KALDI_ASSERT(num_indexes > 0 && num_indexes % 2 == 0 && + indexes.front().n == 0 && indexes.back().n == 1); + int32 n_stride; + if (is_fast) { + n_stride = 1; + // If the n index varies fast for this matrix, then the old row-index + // should be a multiple of 2 because: + // - we assume that the input computation was built for 2 n-values + // - if n varies fast then the cindexes for this matrix in the input + // computation would come in pairs, for n=(0,1) + // - the cindex that 'old_row_index' represents must be for n=0 + // (this is a requirement of this function) + KALDI_ASSERT(old_index % 2 == 0); + *new_n_stride = 1; + // the row-index of the element in question with n=0 will get larger if n + // varies 'fast', because each block of elements with a certain (x,t) value + // grows in size by a factor of num_n_values_ / 2.0. + *new_index = (old_index / 2) * num_n_values_; + } else { + // n varies more slowly; the Indexes are in blocks where the + // first block has n=0, the second has n=1, and so on. + // Because we assume that the cindex that lives in this location + // has n == 0, its position does not change (so new_row_index == + // old_row_index). + *new_index = old_index; + *new_n_stride = (num_indexes / 2); + } +} + +class ComputationLoopedOptimizer { + public: + ComputationLoopedOptimizer(const Nnet &nnet, + NnetComputation *computation): + nnet_(nnet), computation_(computation) { } + bool Optimize(); + + private: + + // Figures out the time shift between the successive computation requests. + static int32 FindTimeShift(const NnetComputation &computation, + const std::vector &segment_ends); + + // This function creates a mapping from a matrix-index > 0, + // to a pair (unique_id, time_offset) that represents the debug-info + // for that matrix-id in computation.debug_info. + // The output vector is indexed by the matrix-index in the computation (the + // zeroth member is not valid). It requires that the + // The 'time_offset' is equal to the 't' value of the zeroth element of the + // cindexes vetor. The 'unique_id' is an integer that uniquely identifies + // what we get from subtracting the 'time_offset' from each 't' value of + // that 'cindexes' vector, and then pairing it up with the 'is_deriv' + // value of the DebugInfo. That is, if two 'cindexes' vectors differ only + // by a time offset, and the 'is_deriv' values are the same they will map to the same + // unique_id. 
+  // The output 'matrix_to_pair' is indexed by matrix index (the zeroth element is
+  // not set).
+  static void CreateMatrixPairs(const NnetComputation &computation,
+                                std::vector<std::pair<int32, int32> > *matrix_to_pair);
+
+
+  // This very simple helper function reverses the map 'matrix_to_pair' so we can
+  // do the reverse lookup.  It outputs a map from pair to matrix index m, where
+  // 1 <= m < matrix_to_pair.size().
+  static void GetPairToMatrixMap(
+      std::vector<std::pair<int32, int32> > &matrix_to_pair,
+      unordered_map<std::pair<int32, int32>, int32,
+                    PairHasher<int32> > *pair_to_matrix);
+
+
+  // Given a vector of lists, one list for each segment, of the active matrices
+  // at the end of that segment, this function converts those lists into a
+  // different representation where each matrix is represented as a pair instead
+  // of as a single int32.  'active_pairs' will have the same dimensions as
+  // 'active_matrices'.
+  static void ConvertListsToPairLists(
+      const std::vector<std::vector<int32> > &active_matrices,
+      const std::vector<std::pair<int32, int32> > &matrix_to_pair,
+      std::vector<std::vector<std::pair<int32, int32> > > *active_pairs);
+
+  // This function modifies the lists of active matrices per segment
+  // (represented as pairs) in 'active_pairs' by sorting them and
+  // then subtracting the time-offset of the first pair in each
+  // list ((*active_pairs)[seg][0].second) from all elements in that list.
+  // It puts the subtracted offset in (*time_offsets)[seg].  This change
+  // of representation makes it easy to tell whether the sets of active
+  // matrices for different segments are identical up to a time-offset.
+  static void NormalizePairLists(
+      std::vector<std::vector<std::pair<int32, int32> > > *active_pairs,
+      std::vector<int32> *time_offsets);
+
+  // This function looks in the matrix 'active_pairs' for the first pair of
+  // identical values, i.e. it is looking for i < j for which
+  // normalized_active_pairs[i] == normalized_active_pairs[j].  (However, the
+  // pair i,j must satisfy an extra condition; see below.)  If a pair
+  // i,j exists satisfying these conditions, this function outputs them to *seg1
+  // and *seg2, and returns true; otherwise it returns false.
+  //
+  // Extra condition:
+  // It turns out that under some circumstances, we can
+  // find repeats that were not "really" repeats (the matrices were not time
+  // shifted).  The situation was a bit obscure (it was a non-recurrent setup
+  // with a lot of extra-right-context, where some inputs were never used), but
+  // to prevent it happening again we are now checking, in addition to the
+  // above, that the time-shift between the segments (i.e. time_offsets[j] -
+  // time_offsets[i]) has the "expected value" based on the assumption that
+  // each segment should be shifted relative to the previous segment by
+  // 'time_shift_per_segment'.
+  static bool FindFirstRepeat(
+      const std::vector<std::vector<std::pair<int32, int32> > > &normalized_active_pairs,
+      const std::vector<int32> &time_offsets,
+      int32 time_shift_per_segment,
+      int32 *seg1, int32 *seg2);
+
+  // Converts a list of pairs (e.g. one of the elements of the output of
+  // 'ConvertListsToPairLists()') back into a list of matrix indexes, using the
+  // map 'pair_to_matrix'.
+ static void PairListToMatrixList( + const std::vector > &pair_list, + const unordered_map, int32, PairHasher > &pair_to_matrix, + std::vector *matrix_list); + + + // This function just does some checking (via asserts), that + // the lists of matrices 'list1' and 'list2' are of the same length, + // that time_difference > 0, that each matrix with index m = list2[i] is of the + // same dimension as the list1[i], with Cindexes that are the same except for + // the time index being greater by 'time_difference' + static void CheckIdentifiedMatrices( + const NnetComputation &computation, + const std::vector &list1, + const std::vector &list2, + int32 time_difference); + + + // Given two command indexes command1 < command2 pointing to commands of type + // kNoOperationMarker, this function modifies the computation by + // removing all commands after command2, replacing command2 with a kGotoLabel + // command pointing to command1 and then inserting just before command1 + // a marker of type kNoOperationLabel. + static void FormInfiniteLoop(int32 command1, int32 command2, + NnetComputation *computation); + + // This is to be called after FormInfiniteLoop. It inserts, just before + // the final kGotoLabel command, commands that initialize + // each of the matrices in list 'matrices1' from the corresponding + // matrix in 'matrices2', using the kAllocMatrixFromOther command. + // This effectively does, for example, matrices1[i] = matrices2[i], + // while initializing matrices1[i] and deallocating matrices2[i]; + // it's implemented as a shallow swap. + // It does this in such an order that even if the two lists are + // not disjoint, the right thing happens. + static void AddMatrixSwapCommands( + const std::vector &matrices1, + const std::vector &matrices2, + NnetComputation *computation); + + + // Called from AddMatrixSwapCommands, this function figures out for us + // an acceptable order in which to execute the kAllocMatrixFromOther + // commands. This is easy to do if matrices1 and matrices2 are disjoint + // sets, but has to be done more carefully if they overlap. + // The output is a list of pairs where each pair (a, b) comes from + // from matrices1 and matrices2 in the same position, i.e. + // a = matrices1[i] and b = matrices2[i]. + static void GetMatrixSwapOrder( + const std::vector &matrices1, + const std::vector &matrices2, + std::vector > *swaps); + + + + /// Given a list of command indexes ('segment_end_commands') which are + /// expected to be command indexes of the kNoOperationMarker at segment + /// boundaries, this function outputs for each of these command indexes a list + /// of matrices which are 'active' at that point in time. By 'active' we mean + /// that the matrix has been written to before that time (note, we don't count + /// initialization with zeros as being written to); and will be read after + /// that time. These is the list of matrices that 'need to be in scope' + /// at those points in time. '*active_matrices' is indexed by the + /// same index as 'segment_end_commands', and is then a list of active + /// matrices, in numerical order of matrix index. + /// Note: for each i, (*active_matrices)[i] will be sorted and unique. 
+ static void FindActiveMatrices(const NnetComputation &computation, + const Analyzer &analyzer, + const std::vector &segment_end_commands, + std::vector > *active_matrices); + + + const Nnet &nnet_; + NnetComputation *computation_; + Analyzer analyzer_; + std::vector > matrix_to_pair_; + + std::vector segment_end_commands_; +}; + +// static +int32 ComputationLoopedOptimizer::FindTimeShift( + const NnetComputation &computation, + const std::vector &segment_ends) { + KALDI_ASSERT(segment_ends.size() >= 3); + // Ignore the first segment as it tends to be a special case + // (it has more left context), + int32 second_segment_begin = segment_ends[0], + third_segment_begin = segment_ends[1], + fourth_segment_begin = segment_ends[2]; + int32 first_output_command_seg2 = -1, + first_output_command_seg3 = -1; + for (int32 c = second_segment_begin; c < third_segment_begin; c++) + if (computation.commands[c].command_type == kProvideOutput && + first_output_command_seg2 < 0) + first_output_command_seg2 = c; + for (int32 c = third_segment_begin; c < fourth_segment_begin; c++) + if (computation.commands[c].command_type == kProvideOutput && + first_output_command_seg3 < 0) + first_output_command_seg3 = c; + if (first_output_command_seg2 < 0 || + first_output_command_seg3 < 0) + KALDI_ERR << "Could not locate output commands for segments 2 and 3."; + const NnetComputation::Command + &command2 = computation.commands[first_output_command_seg2], + &command3 = computation.commands[first_output_command_seg3]; + int32 seg2_node = command2.arg2, seg3_node = command3.arg2; + KALDI_ASSERT(seg2_node == seg3_node); + int32 seg2_submatrix = command2.arg1, + seg3_submatrix = command3.arg1; + KALDI_ASSERT(computation.IsWholeMatrix(seg2_submatrix) && + computation.IsWholeMatrix(seg3_submatrix)); + int32 seg2_matrix = computation.submatrices[seg2_submatrix].matrix_index, + seg3_matrix = computation.submatrices[seg3_submatrix].matrix_index; + KALDI_ASSERT(computation.matrices[seg2_matrix].num_rows == + computation.matrices[seg3_matrix].num_rows); + KALDI_ASSERT(!computation.matrix_debug_info.empty()); + const NnetComputation::MatrixDebugInfo + &debug_info2 = computation.matrix_debug_info[seg2_matrix], + &debug_info3 = computation.matrix_debug_info[seg3_matrix]; + int32 t_offset = debug_info3.cindexes[0].second.t - + debug_info2.cindexes[0].second.t; + int32 num_rows = debug_info2.cindexes.size(); + for (int32 r = 0; r < num_rows; r++) { + KALDI_ASSERT(debug_info3.cindexes[r].second.t == + debug_info2.cindexes[r].second.t + t_offset); + } + return t_offset; +} + +// static +void ComputationLoopedOptimizer::CreateMatrixPairs( + const NnetComputation &computation, + std::vector > *matrix_to_pair) { + typedef unordered_map, int32, + CindexVectorHasher> MapType; + int32 cur_vector_id = 1; + // Note: cindex_map just maps the vector to a unique value, + // and then we manually work out a unique id that takes into + // account the 'is_deriv' values. 
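+  // Illustrative example (made-up values, not part of the original patch): if
+  // matrix 7 and matrix 9 have cindexes that are identical except that every
+  // 't' in matrix 9 is larger by 3, and both have is_deriv == false, then after
+  // subtracting each matrix's first 't' value their cindex vectors become
+  // identical, so they share a vector_id and hence a unique_id; they would map
+  // to pairs like (unique_id=4, t_offset=0) and (unique_id=4, t_offset=3).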
+ MapType cindex_map; + int32 num_matrices = computation.matrices.size(); + matrix_to_pair->resize(num_matrices); + KALDI_ASSERT(computation.matrix_debug_info.size() == num_matrices); + for (int32 m = 1; m < num_matrices; m++) { + KALDI_ASSERT(!computation.matrix_debug_info[m].cindexes.empty()); + std::vector cindexes = computation.matrix_debug_info[m].cindexes; + int32 t_offset = cindexes[0].second.t; + for (std::vector::iterator iter = cindexes.begin(); + iter != cindexes.end(); ++iter) + iter->second.t -= t_offset; + MapType::const_iterator iter = cindex_map.find(cindexes); + int32 vector_id; + if (iter != cindex_map.end()) { + vector_id = iter->second; + } else { + vector_id = cur_vector_id++; + cindex_map[cindexes] = vector_id; + } + bool is_deriv = computation.matrix_debug_info[m].is_deriv; + int32 unique_id = 2 * vector_id + (is_deriv ? 1 : 0); + (*matrix_to_pair)[m].first = unique_id; + (*matrix_to_pair)[m].second = t_offset; + } +} + +// static +void ComputationLoopedOptimizer::GetPairToMatrixMap( + std::vector > &matrix_to_pair, + unordered_map, int32, PairHasher > *pair_to_matrix) { + int32 num_matrices = matrix_to_pair.size(); + // actually there are one fewer matrices than num_matrices. + pair_to_matrix->clear(); + for (int32 m = 1; m < num_matrices; m++) + (*pair_to_matrix)[matrix_to_pair[m]] = m; +} + + +// static +void ComputationLoopedOptimizer::ConvertListsToPairLists( + const std::vector > &active_matrices, + const std::vector > &matrix_to_pair, + std::vector > > *active_pairs) { + active_pairs->clear(); + active_pairs->resize(active_matrices.size()); + int32 num_matrices = matrix_to_pair.size(); + for (size_t seg = 0; seg < active_matrices.size(); seg++) { + const std::vector &this_active_matrix_list = active_matrices[seg]; + std::vector > &this_active_pair_list = + (*active_pairs)[seg]; + this_active_pair_list.resize(this_active_matrix_list.size()); + std::vector::const_iterator iter = this_active_matrix_list.begin(), + end = this_active_matrix_list.end(); + std::vector >::iterator + out_iter = this_active_pair_list.begin(); + for (; iter != end; ++iter, ++out_iter) { + KALDI_ASSERT(*iter > 0 && *iter < num_matrices); + *out_iter = matrix_to_pair[*iter]; + } + } +} + +// static +void ComputationLoopedOptimizer::NormalizePairLists( + std::vector > > *active_pairs, + std::vector *time_offsets) { + int32 num_segments = active_pairs->size(); + time_offsets->resize(num_segments); + for (int32 seg = 0; seg < num_segments; seg++) { + std::vector > &this_pairs = (*active_pairs)[seg]; + std::sort(this_pairs.begin(), this_pairs.end()); + int32 this_offset; + if (!this_pairs.empty()) { + this_offset = this_pairs[0].second; + } else { + // if this_pairs is empty, produce arbitrary offsets that are increasing + // (this will keep some self-testing code happy). + if (seg == 0) { this_offset = 0; } + else { this_offset = (*time_offsets)[seg - 1] + 1; } + } + (*time_offsets)[seg] = this_offset; + std::vector >::iterator + iter = this_pairs.begin(), end = this_pairs.end(); + for (; iter != end; ++iter) + iter->second -= this_offset; + } +} + + +// static +bool ComputationLoopedOptimizer::FindFirstRepeat( + const std::vector > > &normalized_active_pairs, + const std::vector &time_offsets, + int32 time_shift_per_segment, + int32 *seg1, int32 *seg2) { + int32 num_segments = normalized_active_pairs.size(); + // This algorithm may seem like it would be very slow, but the number of + // segments will normally be quite small (e.g. 
10), and the comparison of + // elements of 'normalized_active_pairs' should be fast in cases where they + // differ. + KALDI_ASSERT(num_segments >= 2); + + bool perform_time_offset_check = true; + if (normalized_active_pairs.back().empty()) { + // If there are no variables active after the end of the last-but-one segment + // (which is the last element in segment_ends, since we remove the end of the + // very last segment), then don't perform the check related to + // time-offsets, it's not relevant. [this would probably be a computation + // that doesn't require any context]. + perform_time_offset_check = false; + } + for (int32 s = 0; s < num_segments; s++) { + for (int32 t = s + 1; t < num_segments; t++) { + if ((!perform_time_offset_check || + time_offsets[t]-time_offsets[s] == (t-s) * time_shift_per_segment) && + normalized_active_pairs[s] == normalized_active_pairs[t]) { + *seg1 = s; + *seg2 = t; + return true; + } + } + } + return false; +} + +// static +void ComputationLoopedOptimizer::PairListToMatrixList( + const std::vector > &pair_list, + const unordered_map, int32, PairHasher > &pair_to_matrix, + std::vector *matrix_list) { + matrix_list->resize(pair_list.size()); + std::vector >::const_iterator + iter = pair_list.begin(), end = pair_list.end(); + std::vector::iterator out_iter = matrix_list->begin(); + for (; iter != end; ++iter, ++out_iter) { + unordered_map, int32, + PairHasher >::const_iterator + map_iter = pair_to_matrix.find(*iter); + if (map_iter == pair_to_matrix.end()) { + KALDI_ERR << "Could not find pair in map (code error)"; + } + *out_iter = map_iter->second; + } +} + + + +// static +void ComputationLoopedOptimizer::FindActiveMatrices( + const NnetComputation &computation, + const Analyzer &analyzer, + const std::vector &segment_end_commands, + std::vector > *active_matrices) { + int32 num_matrices = computation.matrices.size(); + int32 num_segments = segment_end_commands.size(); + active_matrices->clear(); + active_matrices->resize(num_segments); + // this object just makes available some extra functions, vs. the Analyzer + // object. + ComputationAnalysis analysis(computation, analyzer); + KALDI_ASSERT(IsSortedAndUniq(segment_end_commands)); + + // the following vector gives us, for each matrix index, a submatrix index + // that covers the whole of that matrix (needed by interface of 'analysis' object). + std::vector whole_submatrices; + computation.GetWholeSubmatrices(&whole_submatrices); + for (int32 m = 1; m < num_matrices; m++) { + // the following are command indexes, comparable with the indexes + // in 'segment_end_commands'. + int32 s = whole_submatrices[m], // submatrix consisting of the whole of + // 'm'. + first_access = analysis.FirstAccess(s), + last_access = analysis.LastAccess(s); + for (int32 seg = 0; seg < num_segments; seg++) { + int32 segment_end = segment_end_commands[seg]; + if (first_access < segment_end && last_access > segment_end) { + // If the block of time during which the matrix is accessed, includes + // this segment end-point, then the matrix is considered 'active' at + // that time. 
+ (*active_matrices)[seg].push_back(m); + } + } + } +} + +// static +void ComputationLoopedOptimizer::CheckIdentifiedMatrices( + const NnetComputation &computation, + const std::vector &list1, + const std::vector &list2, + int32 time_difference) { + KALDI_ASSERT(time_difference > 0); + KALDI_ASSERT(list1.size() == list2.size()); + KALDI_ASSERT(!computation.matrix_debug_info.empty()); + for (size_t i = 0; i < list1.size(); i++) { + int32 m1 = list1[i], m2 = list2[i]; + const NnetComputation::MatrixInfo + &matrix_info1 = computation.matrices[m1], + &matrix_info2 = computation.matrices[m2]; + KALDI_ASSERT(matrix_info1.num_rows == matrix_info2.num_rows && + matrix_info1.num_cols == matrix_info2.num_cols && + matrix_info1.stride_type == matrix_info2.stride_type); + const NnetComputation::MatrixDebugInfo + &debug_info1 = computation.matrix_debug_info[m1], + &debug_info2 = computation.matrix_debug_info[m2]; + KALDI_ASSERT(debug_info1.is_deriv == debug_info2.is_deriv); + KALDI_ASSERT(debug_info1.cindexes.size() == debug_info2.cindexes.size()); + std::vector::const_iterator iter1 = debug_info1.cindexes.begin(), + end1 = debug_info1.cindexes.end(), + iter2 = debug_info2.cindexes.begin(); + for (; iter1 != end1; iter1++,iter2++) { + KALDI_ASSERT(iter2->first == iter1->first && + iter2->second.n == iter1->second.n && + iter2->second.t == iter1->second.t + time_difference && + iter2->second.x == iter1->second.x); + } + } +} + + +// static +void ComputationLoopedOptimizer::GetMatrixSwapOrder( + const std::vector &matrices1, + const std::vector &matrices2, + std::vector > *swaps) { + KALDI_ASSERT(matrices1.size() == matrices2.size()); + swaps->clear(); + int32 num_matrices = matrices1.size(); + std::vector processed(num_matrices, false); + std::vector queue; + + // num_loops is just for infinite-loop detection. + int32 num_loops = 0; + for (; static_cast(swaps->size()) < num_matrices; num_loops++) { + for (int32 i = 0; i < num_matrices; i++) { + if (processed[i]) + continue; + int32 m1 = matrices1[i], m2 = matrices2[i]; + std::vector::const_iterator iter = + std::lower_bound(matrices2.begin(), matrices2.end(), m1); + if (iter == matrices2.end() || *iter != m1) { + // Matrix m1 does not appear in the list 'matrices2', so + // we are safe to process it at any time. + swaps->push_back(std::pair(m1, m2)); + processed[i] = true; + } else { + int32 m1_pos_in_matrices2 = iter - matrices2.begin(); + if (processed[m1_pos_in_matrices2]) { + // We're safe to do this swap now, because the matrix m1 has already + // appeared on the RHS of a swap, and by this point has been + // deallocated, in effect. + swaps->push_back(std::pair(m1, m2)); + processed[i] = true; + } + // else do nothing, we cannot process m1 yet because + // at this point in the computation it is still allocated. + } + } + // The following assert is to check that we don't loop infinitely. We can + // prove that infinite looping won't happen, after on proving that there can + // be no cycles like (m1, m2), (m2, m3), (m3, m1) (the length of 3 is chosen + // arbitrarily as an example). If such a cycle existed, we can reach a + // contradiction based on the time-index (t) of the first cindex in m1. + // Define t1 = that time index, t2 the same for m2, t3 the same for m3. The + // existence of the three pairs [as pairs like (matrices1[i], matrices2[i])] + // implies that t2 > t1, t3 > t2, and t1 > t3 respectively, but this is + // impossible. + // This shows that all chains of dependencies must terminate. 
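+    // A worked example (hypothetical matrix indexes, not part of the original
+    // patch): with matrices1 = {3, 4} and matrices2 = {1, 3}, the first pass
+    // defers i = 0 (m1 == 3 still appears in matrices2 at an unprocessed
+    // position) and emits (4, 3); the second pass then emits (3, 1), so matrix
+    // 4 consumes matrix 3's contents before matrix 3 is overwritten with
+    // matrix 1's.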
+ KALDI_ASSERT(num_loops <= num_matrices); + } +} + +// static +void ComputationLoopedOptimizer::AddMatrixSwapCommands( + const std::vector &matrices1, + const std::vector &matrices2, + NnetComputation *computation) { + std::vector > swaps; + // Note: in 'easy' cases where matrices1 and matrices2 are disjoint, + // 'swaps' will just be the vector { (matrices1[0],matrices2[0]), + // (matrices1[1],matrices2[1]), ... }, + // but in some cases these may need to get reordered. + GetMatrixSwapOrder(matrices1, matrices2, &swaps); + + NnetComputation::Command goto_label_command = computation->commands.back(); + KALDI_ASSERT(goto_label_command.command_type == kGotoLabel); + computation->commands.pop_back(); + + // the following vector gives us, for each matrix index, a submatrix index + // that covers the whole of that matrix (needed because the commands + // require submatrix indexes) + std::vector whole_submatrices; + computation->GetWholeSubmatrices(&whole_submatrices); + size_t num_matrices = whole_submatrices.size(); + + for (size_t i = 0; i < swaps.size(); i++) { + int32 m1 = swaps[i].first, m2 = swaps[i].second; + KALDI_ASSERT(static_cast(m1) < num_matrices && + static_cast(m2) < num_matrices); + int32 s1 = whole_submatrices[m1], s2 = whole_submatrices[m2]; + computation->commands.push_back( + NnetComputation::Command( + kAllocMatrixFromOther, s1, s2)); + } + computation->commands.push_back(goto_label_command); +} + +// static +void ComputationLoopedOptimizer::FormInfiniteLoop( + int32 command1, int32 command2, + NnetComputation *computation) { + KALDI_ASSERT(static_cast(computation->commands.size()) >= + command2 + 1 && command1 < command2); + KALDI_ASSERT( + computation->commands[command1].command_type == kNoOperationMarker && + computation->commands[command2].command_type == kNoOperationMarker); + // Remove any commands after 'command2'. + computation->commands.resize(command2 + 1); + computation->commands[command2].command_type = kGotoLabel; + computation->commands[command2].arg1 = command1; + NnetComputation::Command c(kNoOperationLabel); + computation->commands.insert(computation->commands.begin() + command1, + c); + // Now the kNoOperationLabel command is at position 'command1'. +} + + + +bool ComputationLoopedOptimizer::Optimize() { + analyzer_.Init(nnet_, *computation_); + KALDI_ASSERT(!computation_->matrix_debug_info.empty() && + "You must request matrix debug info when compiling " + "looped computations."); + + // get the indexes of the separator commands at the ends of segments. + std::vector segment_ends; + GetSegmentEnds(*computation_, &segment_ends); + int32 time_shift_per_segment = FindTimeShift(*computation_, + segment_ends); + + // Ignore the end of the very last segment- it is not a candidate for a + // 'splice point'. What we're doing here is like creating a tape loop; we + // have to find a place where the list of variables is the same except for a + // time offset. + // [note: it's not exactly like a tape loop because the prologue can + // vary... the sequence is of the form like a b b b b b .. ] + segment_ends.pop_back(); + + + std::vector > active_matrices; + // Find the list of matrices active at each of those segment-end-command + // times. + FindActiveMatrices(*computation_, analyzer_, segment_ends, + &active_matrices); + + // Find a representation of the matrices of the computation as pairs + // (unique_id, time_offset) that are more amenable to finding + // matrices that represet lists of Cindexes that differ only by + // a time offset. 
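+  // Sketch of the idea with made-up numbers (not part of the original patch):
+  // if time_shift_per_segment == 3 and the matrices active at the ends of
+  // segments 1 and 2 normalize to the same list of (unique_id, time_offset)
+  // pairs, with time_offsets[2] - time_offsets[1] == 3, then FindFirstRepeat()
+  // below returns seg1 == 1 and seg2 == 2, and the computation can be spliced
+  // into a loop at those two points.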
+ std::vector > matrix_to_pair; + CreateMatrixPairs(*computation_, &matrix_to_pair); + + // Create the reverse map from pair to matrix index; we'll need it. + unordered_map, int32, PairHasher > pair_to_matrix; + GetPairToMatrixMap(matrix_to_pair, &pair_to_matrix); + + // get lists of matrix per segment in the pair representation. + std::vector > > pair_lists; + ConvertListsToPairLists(active_matrices, matrix_to_pair, + &pair_lists); + + std::vector time_offsets; + NormalizePairLists(&pair_lists, &time_offsets); + + // Note: seg1 and seg2 are indexes into 'segment_ends', representing + // points in time (that happen to be the ends of segments). + int32 seg1, seg2; + if (!FindFirstRepeat(pair_lists, + time_offsets, + time_shift_per_segment, + &seg1, &seg2)) { + KALDI_VLOG(2) << "Could not find repeats of variables."; + return false; + } + + // reverse the normalization for segments seg1 and seg2. + for (size_t i = 0; i < pair_lists[seg1].size(); i++) + pair_lists[seg1][i].second += time_offsets[seg1]; + for (size_t i = 0; i < pair_lists[seg2].size(); i++) + pair_lists[seg2][i].second += time_offsets[seg2]; + std::vector seg1_matrices, seg2_matrices; + PairListToMatrixList(pair_lists[seg1], pair_to_matrix, &seg1_matrices); + PairListToMatrixList(pair_lists[seg2], pair_to_matrix, &seg2_matrices); + + int32 time_difference = time_offsets[seg2] - time_offsets[seg1]; + CheckIdentifiedMatrices(*computation_, seg1_matrices, seg2_matrices, + time_difference); + + + FormInfiniteLoop(segment_ends[seg1], segment_ends[seg2], computation_); + + AddMatrixSwapCommands(seg1_matrices, seg2_matrices, computation_); + + RenumberComputation(computation_); + + FixGotoLabel(computation_); + + return true; +} + + +void OptimizeLoopedComputation(const Nnet &nnet, + NnetComputation *computation) { + ComputationLoopedOptimizer optimizer(nnet, computation); + optimizer.Optimize(); +} + + + +void FixGotoLabel(NnetComputation *computation) { + int32 num_commands = computation->commands.size(); + if (num_commands == 0) + return; + for (int32 c = num_commands - 1; c >= 0; c--) { + if (computation->commands[c].command_type == kGotoLabel) { + int32 dest_command = computation->commands[c].arg1; + if (static_cast(dest_command) < computation->commands.size() && + computation->commands[dest_command].command_type == kNoOperationLabel) + return; // nothing to fix. + for (int32 d = 0; d + 1 < num_commands; d++) { + if (computation->commands[d].command_type == kNoOperationLabel) { + computation->commands[c].arg1 = d; + return; + } + } + KALDI_ERR << "Label not found."; + } else if (computation->commands[c].command_type == kProvideOutput) { + // sometimes kProvideOutput commands are temporarily ordered after + // the kGotoLabel command, and we need to work in that case. + continue; + } else { + // it loks like there is no 'goto' command in this computation- + // if there were, it would be right at the end, possibly followed by + // kProvideOutput commands. + break; + } + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index e224983f847..9977ca8952a 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -52,14 +52,11 @@ struct NnetOptimizeOptions; // Forward declaration. may be sub-matrices of larger matrices. 
Note: the following - - Define last-access(submatrix) as: - If matrix-of(submatrix) is an output, then num-commands, otherwise the + - Define last-access(submatrix) as the last command that accesses that submatrix for either read or write. [note: deallocation does not count as a read or write operation]. - - Define first-access(submatrix) as: - If matrix-of(submatrix) is an input, then -1, otherwise the first command - that is *not* an allocation command that accessed that submatrix for either - read or write. + - Define first-access(submatrix) as the first command not of type kAlloc* + that accessed that submatrix for either read or write. - Define last-write-access(submatrix) as the last command-index that accessed the submatrix in a write operation, or -1 if there is no such command (this could happen for inputs). @@ -99,53 +96,41 @@ struct NnetOptimizeOptions; // Forward declaration. Otherwise (cases (b) and (c), in-place propagate or backprop), we insist that: - first-access(s2) == C - last-access(s1) == C - Note: in either case, these conditions imply that s2 is not an input and s1 is - not an output. + Note: in either case, these conditions imply that m2/s2 is not an input and m1/s1 is + not an output. [i.e. s1 *may* be an input and s2 *may* be an output]. + + We can explain the procedure for both left-merge and right-merge in one, because + it's the same. Define s_to_keep and m_to_keep as s1 and m1 if we're left-merging + and s2 and m2 if we're right-merging, and s_to_discard and m_to_discard the opposite + way. + + The procedure to merge in general is as follows: - The sequence of things we have to do for a right-merge (in which we delete - s1,m1) is as follows: - All submatrices that reference m1, make them reference m2 instead. - [later we'll renumber so that there are no duplicates.] - - If m1 was an input, replace it as an input with m2 and remove the - command that allocated m2. - - If it was an assignment [case (a)], replace the assignment command with a - no-op. - - If both m1 and m2 have commands that allocate them, keep only the - allocation command for m2, and make sure that it zeroes the data (we can - later change to undefined if allowed) and that it's before the first - non-allocation access of m1. Otherwise remove any allocation commands - (the merged variable is an input). - - If both m1 and m2 have commands that deallocate them, keep only the - deallocation command for m2, and make sure that it's after the last - access of m1 (otherwise delete any deallocation command, because m2 must - be an output). [note: previously we kept the later of the 2 commands, - but this had the effect of making inaccurate the Analyzer info for - a matrix (m2) that might later be used. - - If m1 had stride_type == kStrideEqualNumCols, set m2's stride_type - to kStrideEqualNuMCols. - - - The sequence of things we have to do for a right-merge (in which we delete - s1,m1) is as follows: - - All submatrices that reference m2, make them reference m1 instead. - [later we'll renumber so that there are no duplicates.] - - If m2 was an output, replace it as an output with m1 and remove the - command that deallocated m1. - ... the last four bullet-points, regarding removing the assignment command, - and allocation and deallocation, and stride-type, are the same as for a - left-merge, except swap m1 and m2. + [later we'll renumber so that there are no duplicates.] 
   This automatically
+    takes care of making the input and output and allocation/deallocation
+    commands refer to the right matrix, in most cases.
+  - We need to get rid of duplicate or unnecessary allocation commands:
+    If m_to_discard is an input then get rid of the allocation command for
+    m_to_keep; otherwise get rid of the allocation command of m_to_discard.
+  - We need to get rid of duplicate or unnecessary deallocation commands:
+    If m_to_discard is an output then get rid of the deallocation command
+    for m_to_keep; otherwise get rid of the deallocation command for
+    m_to_discard.
 
   At the end when we call RemoveOrphanMatrices(), the renumbering code will
   automatically detect that there are duplicate submatrices, and will merge
   them, as well as removing the now-unused matrix indexes.  After merging, we
   will mark the variables (i.e. row-ranges) underlying s1 and s2 as being
-  "dirty" so they can no longer be merged during the lifetime of this class.
+  "dirty" so they can no longer be merged during the lifetime of this class--
+  this is so we don't have to think too hard; we apply this optimization
+  multiple times until it makes no change (see
+  nnet-optimize.cc:VariableMergingOptimization()).
 */
 class VariableMergingOptimizer {
  public:
  VariableMergingOptimizer(const NnetOptimizeOptions &config,
                           const Nnet &nnet,
-                          const ComputationRequest &request,
                           NnetComputation *computation);
  // Note: you can call this only once.  If it returns true, it means it has
  // merged variables.  In this case, you have the option to instantiate another
@@ -170,20 +155,10 @@ class VariableMergingOptimizer {
  /// @param s2 [in]   A submatrix-index s2 > 0
  std::pair<bool,bool> MayBeMerged(int32 command, int32 s1, int32 s2) const;
 
-  // performs the left merge.  Search for left-merge in the comment
-  // above the class declaration for details.
-  void DoLeftMerge(int32 command_index, int32 s1, int32 s2);
-
-  // performs the right merge.  Search for right-merge in the comment
-  // above the class declaration for details.
-  void DoRightMerge(int32 command_index, int32 s1, int32 s2);
-
-  // Performs the actions common to both left and right merges, regarding
-  // removing the assignment command, and allocation and deallocation (called
-  // from DoLeftMerge and DoRightMerge).  The m_to_keep and m_to_discard
-  // are the matrix-indexes we will keep and discard respectively.
-  void DoMergeCommon(int32 command_index, int32 m_to_keep,
-                     int32 m_to_discard);
+  // Merges two matrices, whether left merge or right merge.  s_to_keep and
+  // s_to_discard are the submatrix-indexes we will keep and discard
+  // respectively (these are s1 and s2 in some order).
+  void DoMerge(int32 command_index, int32 s_to_keep, int32 s_to_discard);
 
  /// Marks the variables underlying submatrix 's' as dirty
  void MarkAsDirty(int32 s);
@@ -192,7 +167,6 @@ class VariableMergingOptimizer {
 
  const NnetOptimizeOptions &config_;
  const Nnet &nnet_;
-  const ComputationRequest &request_;
  NnetComputation *computation_;
 
  Analyzer analyzer_;
@@ -208,184 +182,29 @@ class VariableMergingOptimizer {
 };
 
 
-/** This class is responsible for consolidating the model-update part of
-    backprop commands, for components in (e.g.) recurrent networks that need to
-    have many separate backprop commands, into more efficient single commands
-    operating on consolidated data in larger matrices.  This is useful for
-    recurrent networks.
*/ -class ModelUpdateConsolidator { - public: - ModelUpdateConsolidator(const Nnet &nnet, - NnetComputation *computation); - void ConsolidateModelUpdate(); - private: - void ConsolidateUpdateForComponent( - int32 component, - const std::vector &backprop_commands); - - /// This function, called at the end of ConsolidateModelUpdate(), takes the - /// commands that we have put in extra_commands_, final_commands_ and - /// final_deallocate_commands_, and puts them in the appropriate place in - /// computation->commands_. - void AddCommandsToComputation(); - - /// You call this function when you want to consolidate the values of a list - /// of submatrices taken just prior to particular commands. The input - /// 'commands' and 'submatrices' lists must be the same size, and size must be - /// > 1. This function will create a new matrix that is the row-wise - /// concatentation of all these submatrices, with values taken just prior to - /// the respective command indexes. This function will will add to - /// extra_commands_ the commands to do the copying at the appropriate places - /// (at the supplied command indexes; they will be inserted just before). The - /// return value is the submatrix index of a submatrix that represents the - /// whole of the consolidated matrix. This command will insert, at the - /// beginning of the computation (in extra_commands_[0]), a command to - /// initialize the matrix; and will append to final_deallocate_commands_ the - /// commands to deallocate the matrix. If computation_->matrix_debug_info is - /// nonempty, this function will also update computation_->matrix_debug_info - /// with suitable values for the newly added matrix - int32 ConsolidateSubmatrices( - const std::vector &commands, - const std::vector &submatrices); - - /// This function, called from ConsolidateSubmatrices, will - /// update 'debug_info' by appending the corresponding 'indexes' from - /// the existing debug info for this submatrix. It will also set - /// the 'is_deriv' of '*debug_info' to the same value as the - /// debug info for 'submatrix_index', and set the 'node_index' to the - /// 'node_index' in the debug info for that submatrix-index. - /// It requires that computation_->matrix_debug_info be nonempty. - void AppendDebugInfoForSubmatrix( - int32 submatrix_index, - NnetComputation::MatrixDebugInfo *debug_info) const; - - const Nnet &nnet_; - NnetComputation *computation_; - - // Indexed by the original command index in *computation_ (and sized to the - // original number of commands in *computation_ before we added anything), - // extra_commands_[c] contains a list of commands that need to be inserted - // just before command c in the previously existing computation. - std::vector > extra_commands_; - - // This is as list of kBackprop commands that will be placed after the - // commands in 'computation_->commands' and 'extra_commands_', but before - // the 'final_deallocate_commands_'. - std::vector final_commands_; - // This is a list of commands to deallocate our 'consolidated' matrices; the - // commands will be placed after the commands in 'final_commands_'. - std::vector final_deallocate_commands_; -}; - - -// We declare this class in the .cc file, we don't need to export it. -// It's used inside RenumberComputation. -class ComputationRenumberer { - public: - ComputationRenumberer(NnetComputation *computation): - computation_(computation) { } - - void Renumber(); - private: - // this function removes unused vectors within the indexes_multi_ array, i.e. 
- // ones that are not referenced in the computation. - void RemoveUnusedIndexesMulti(); - // this function computes the submatrix_is_used_ vector, saying whether each - // of the original submatrices is referenced somewhere. - void ComputeSubmatrixIsUsed(); - // this function computes the matrix_is_used_ vector (from the - // submatrix_is_used_ vector, from computation_->input_output_info, and from - // computation_->commands, saying whether each of the original matrices is - // referenced somewhere, directly or indirectly. - void ComputeMatrixIsUsed(); - // This function sets up mappings from old to new matrix and submatrix indexes, - // writing to num_{,sub}matrices_new_ and old_to_new_{,sub}matrix_. - void SetUpMappings(); - // This function renumbers submatrix indexes appearing within commands and - // indexes_multi_, and then removes unused submatrices from the list of - // submatrices while leaving the matrix-indexes at their old values (they will - // be mapped by RenumberMatrices()). - void RenumberSubmatrices(); - // renumber matrix indexes appearing within 'commmands', within 'submatrices' - // and 'input_output_info'; renumber 'matrices' and if applicable - // 'debug_info'. - void RenumberMatrices(); - // removes duplicates within the indexes_multi array itself. - void RemoveIndexesMultiDuplicates(); - // removes unused elements and duplicates within 'computation->indexes' - void RenumberIndexes(); - // removes unused elements and duplicates within 'computation->indexes_ranges' - void RenumberIndexesRanges(); - - struct SubMatrixHasher { - SubMatrixHasher() { } - size_t operator () (const NnetComputation::SubMatrixInfo &submat) const { - // these numbers are arbitrarily chosen primes. - return submat.matrix_index + - 19553 * submat.row_offset + - 29297 * submat.num_rows + - 42209 * submat.col_offset + - 56527 * submat.num_cols; - } - }; +/** + This optimization consolidates + the model-update part of + backprop commands, for components in (e.g.) recurrent networks that need to + have many separate backprop commands, into more efficient single commands + operating on consolidated data in larger matrices. This is useful for + recurrent networks. The resulting computation separates the backprop for + data-derivatives from the model-update part of backprop. + */ +void ConsolidateModelUpdate(const Nnet &nnet, + NnetComputation *computation); - // Here, T will be int32 or std::pair - template - struct PointerCompare { - // This provides an operator < on two vectors of ints or pairs of ints. It - // is designed to provide a total order on the vectors while accessing as - // small a portion of the vectors' data as possible. It's used in removing - // duplicates from computation_->indexes_multi and computation_->indexes. - // First it compares the length, then it does lexicographical compare. - bool operator ()(const std::vector *ptr1, - const std::vector *ptr2) const { - size_t size1 = ptr1->size(), size2 = ptr2->size(); - if (size1 < size2) return true; - else if (size1 > size2) return false; - else return (*ptr1 < *ptr2); // use the std::vector operator <, which is - // lexicographical comparison. - } - }; - /// creates a renumbering that removes the elements in "to_remove", - /// e.g. if old_num_elements = 3 and to_remove = [1], would output - /// the vector [ 0, -1, 1 ]. - static void CreateRenumbering(int32 old_num_elements, - const std::vector &to_remove, - std::vector *renumbering); - - /// creates a renumbering from old to new index that removes the unused - /// elements, e.g. 
if used == [ true, false, true, true], would output the - /// vector [ 0, -1, 1, 2 ]. Returns number of new elements, i.e. the - /// number of elements of 'used' that were true. - static int32 CreateRenumbering(const std::vector &used, - std::vector *renumbering); - - // vector of bool indexed by original submatrix-index, that is true if a - // submatrix-index is used somewhere in the computation (always true for - // the zeroth element). - std::vector submatrix_is_used_; - // vector of bool indexed by original submatrix-index, that is true if a - // submatrix-index will be kept; this is like submatrix_is_used_; but for - // duplicate submatrices, all but the first duplicate will be marked false). - std::vector submatrix_is_kept_; - // vector of bool indexed by original-matrix-index > 0, that is true if a - // matrix-index is used somewhere in the computation, directly or indirectly. - // always true for the zeroth element. - std::vector matrix_is_used_; - NnetComputation *computation_; - int32 num_matrices_new_; - int32 num_submatrices_new_; - std::vector old_to_new_matrix_; // numbered by orig-matrix-index, gives - // new-matrix-index. -1 for removed - // ones. - std::vector old_to_new_submatrix_; // numbered by orig-submatrix-index, - // gives new-submatrix-index. -1 - // for removed ones. -}; +// Class DerivativeTimeLimiter is used inside LimitDerivativeTimes(). +// Its function is to modify the computation so that we don't work +// with derivatives outside of a specified range of t values; this is +// useful, for instance, in BLSTMs where you might have a fair amount of +// left and right context in the training examples but don't want to +// propagate the derivatives to there. +// // We require that the computation have debug info set up // (!matrix_debug_info.empty()) and that this be the first // optimization you perform. This means that the debug_info will @@ -402,11 +221,6 @@ class DerivativeTimeLimiter { private: - // This command ensures that for each matrix m there is a corresponding - // submatrix that spans the entire matrix, and stores its index in - // entire_submatrix_[m]. - void EnsureMatricesHaveEntireSubmatrices(); - // sets up matrix_prune_info_. void ComputeMatrixPruneInfo(); @@ -502,7 +316,7 @@ class DerivativeTimeLimiter { // for each matrix index > 0, the index of a submatrix that consists of // the entirety of that matrix. - std::vector entire_submatrix_; + std::vector whole_submatrices_; std::vector matrix_prune_info_; @@ -537,7 +351,6 @@ void LimitDerivativeTimes(const Nnet &nnet, int32 max_deriv_time, NnetComputation *computation); - /** This function, used in 'shortcut' compilation where we first compile a smaller computation with the same structure but only 2 distinct 'n' values, works out whether a computation is 'decomposable'; if so, @@ -554,57 +367,88 @@ void LimitDerivativeTimes(const Nnet &nnet, 'regular' structure, is as follows: - The 't' and 'x' values present are the same for each 'n', - The order in which the indexes appear is EITHER of the following: - - The 'n' varies the most rapidly, i.e. the order is: + - The 'n' index varies 'fast', i.e. the order is: (t1,x1,0), (t1,x1,1) ... (t1,x1,N-1) \ (t2,x2,0), (t2,x2,1) ... (t2,x2,N-1) ... - - The 'n' varies the least rapidly, i.e. the order is: + - The 'n' index varies 'slowly', i.e. the order is: (t1,x1,0), (t2,x2,0) ... \ (t1,x1,1), (t2,x2,1) ... \ ... \ (t1,x2,N-1), (t2,x2,N-1) ... 
In either case, there does not have to be any particular rhyme or - reason to the order of the t and x values, the regularity on 'n' is + reason to the order of the t and x values; the regularity on 'n' is all that we care about. */ bool ComputationIsDecomposable(const ComputationRequest &request, ComputationRequest *mini_request, - int32 *num_n_values); - -/** - This function is used in 'shortcut' compilation - */ -bool ExpandComputation(const Computation &computation, - int32 num_n_vlues, - Computation *expanded_computation) - + int32 *num_n_values); // TODO: implement this. +/** + This function is used in 'shortcut' compilation to expand a computation + that has been compiled for exactly 2 'n' values, to one that is suitable + for some num_n_values > 2. + @param [in] nnet The neural network for which this computation + is being built. + @param [in] misc_info The same MiscComputationInfo object that was + present in the ComputationRequests that were + originally used to generate the computation + (required to generated the PrecomputedIndexes) + @param [in] computation The computation that was compiled for exactly + 2 'n' values (n=0 and n=1) + @param [in] need_debug_info True if we want to retain the 'debug_info' + in the output 'expanded_computation'. In any + case, the 'debug_info' is required in the + input computation. + @param [in] num_n_values The number of 'n' values we want in the output + computation + @param [out] expanded_computation The expanded computation. -/// This function detects submatrices, matrices, and members of indexes_multi -/// and indexes that are never used (e.g. due to changes made in other -/// optimization code), and removes them from the computation by way of suitable -/// renumbering. It does not remove no-ops from computation->commands_; to do -/// that, call RemoveNoOps(computation). + */ +void ExpandComputation(const Nnet &nnet, + const MiscComputationInfo &misc_info, + const NnetComputation &computation, + bool need_debug_info, + int32 num_n_values, + NnetComputation *expanded_computation); + + + +/// This function detects cases where commands of type kCopyRows, kAddRows or +/// kAddToRows can be converted to commands of type kMatrixCopy or kMatrixAdd, +/// and converts them (this may involve adding submatrices). +/// +/// This function returns true if it made any changes to the computation; if it +/// returns true, then after doing this you should at some point do +/// RenumberComputation(), which will remove any now-unused members of +/// computation->indexes. +bool ReplaceRowWithMatrixOps(NnetComputation *computation); + +/// This function detects cases where commands of type kCopyRows, kAddRows, +/// kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti or +/// kAddRowRanges use indexes that start or end with -1's or equivalents, +/// and replace them with similar commands that act on a sub-matrix of the +/// matrices they are currently acting on. This will help efficiency by +/// avoiding launching unnecessary copies of the kernel (that don't really +/// have to do anything). +/// +/// This function returns true if it made any changes to the computation; if it +/// returns true, then after doing this you should at some point do +/// RenumberComputation(), which will remove any now-unused members of +/// computation->indexes. +bool SnipRowOps(NnetComputation *computation); + +/// This function detects submatrices and matrices that are never used (e.g. 
due +/// to changes made in other optimization code), and members of indexes, +/// indexes_multi and indexes_ranges that are unused or are duplicates, and +/// removes them from the computation by way of suitable renumbering. It does +/// not remove no-ops from computation->commands_; to do that, call +/// RemoveNoOps(computation). void RenumberComputation(NnetComputation *computation); /// Removes commands of type kNoOperation in the computation. void RemoveNoOps(NnetComputation *computation); -/// Wherever matrix orig_matrix_index appears in the input of the network -/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. -/// Returns true if it did replace it. -bool ReplaceInInput( - const Nnet &nnet, int32 orig_matrix_index, int32 new_matrix_index, - NnetComputation *computation); - -/// A helper function used in some optimization functions. -/// Wherever matrix orig_matrix_index appears in the output of the network -/// (i.e. in computation->input_output_info), replaces it with new_matrix_index. -/// Returns true if it did replace it. -bool ReplaceInOutput( - const Nnet &nnet, int32 orig_matrix_index, int32 new_matrix_index, - NnetComputation *computation); - /// This function outputs to "submatrix_args" the addresses of a subset of /// arguments arg1 through arg6 in "command", that correspond to the indexes of /// submatrices. This is useful in renumbering code. Note: some of the @@ -620,7 +464,6 @@ void IdentifySubmatrixArgs(NnetComputation::Command *command, void IdentifySubmatrixArgs(std::vector *commands, std::vector *submatrix_args); - /// This function outputs to "submatrix_args" the addresses of integers in /// 'computation' that correspond to submatrices. These may be present in /// 'commands', and in 'indexes_multi'. This is useful in renumbering code. @@ -631,32 +474,6 @@ void IdentifySubmatrixArgsInComputation(NnetComputation *computation, std::vector *submatrix_args); -/// This function outputs to "matrix_args" the addresses of a subset of the -/// arguments arg1 through arg6 in "command", that correspond to the indexes of -/// matrices. This is useful in renumbering code. (Note: only a few types of -/// command use matrix indexes). -void IdentifyMatrixArgs(NnetComputation::Command *command, - std::vector *matrix_args); - -/// This function outputs to "matrix_args" the addresses of a subset of the -/// arguments arg1 through arg6 in commands in "commands", that correspond to -/// the indexes of matrices. This is useful in renumbering code. (Note: only a -/// few types of command use matrix indexes). -void IdentifyMatrixArgs(std::vector *command, - std::vector *matrix_args); - -/// This function outputs to "matrix_args" the addresses of indexes inside -/// 'computation' that correspond to matrices. These live inside -/// computation->commands and computation->input_output_info; and if -/// 'include_from_submatrices' is true, then the matrix-indexes present in -/// computation->submatrices[*].matrix_index will be included too. Zeros may be -/// present if there were optional arguments; we do include pointers to them, -/// but you can just ignore them. -void IdentifyMatrixArgsInComputation(bool include_from_submatrices, - NnetComputation *computation, - std::vector *matrix_args); - - /// Identifies in the vector of commands, arguments that correspond to indexes /// into the computation's indexes_multi array, and outputs a list of pointers /// to those arguments to 'indexes_multi_args'. Useful in renumbering code. 
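The renumbering machinery referred to throughout this header ultimately reduces to computing an old-to-new index map from a per-element 'used' flag, with -1 marking removed elements. The following is a small self-contained sketch of that mapping in plain C++; it does not use the actual ComputationRenumberer code, and the function name is made up, but it reproduces the documented example above (used == [true, false, true, true] gives [0, -1, 1, 2]).

// Standalone sketch of the renumbering idea: given flags saying which elements
// are still used, produce an old->new mapping with -1 for removed elements and
// return the new number of elements.  Illustrative only; not Kaldi code.
#include <cassert>
#include <iostream>
#include <vector>

static int CreateRenumberingSketch(const std::vector<bool> &used,
                                   std::vector<int> *renumbering) {
  renumbering->resize(used.size());
  int next_new_index = 0;
  for (size_t i = 0; i < used.size(); i++)
    (*renumbering)[i] = used[i] ? next_new_index++ : -1;
  return next_new_index;  // number of elements kept.
}

int main() {
  std::vector<bool> used = {true, false, true, true};
  std::vector<int> renumbering;
  int num_new = CreateRenumberingSketch(used, &renumbering);
  assert(num_new == 3);
  for (size_t i = 0; i < renumbering.size(); i++)
    std::cout << renumbering[i] << (i + 1 < renumbering.size() ? " " : "\n");
  return 0;
}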
@@ -681,7 +498,26 @@ void IdentifyIndexesArgs(std::vector *commands, void IdentifyIndexesRangesArgs(std::vector *commands, std::vector *indexes_ranges_args); - +/// This function tries to optimize computation 'computation' for an 'looped' +/// computation. It expects as input a computation with no backprop but with +/// multiple 'segments' separated by command kNoOperation, where each segment +/// corresponds to a new chunk of input and output. It tries to locate a pair +/// of segment boundaries, with command indexes c1 and c2, where the active +/// matrices have the same debug-info other than a time offset and can be +/// identified with each other, and the no-op command at c2 can be replaced with +/// 'got c1', creating a computation that 'goes on forever'. +/// If it can't do this, it does nothing. You can figure out that this is the +/// case by checking whether kGotoLabel is the last command in the computation. +/// [If this optimization fails, the whole computation may have to be +/// regenerated with more segments.] +void OptimizeLoopedComputation(const Nnet &nnet, + NnetComputation *computation); + + +/// This function ensures that the arg1 of a final command of type kGotoLabel is +/// the same as the command with type kNoOperationLabel. This is necessary +/// if you do any other type of optimization after 'OptimizeLoopedComputation()'. +void FixGotoLabel(NnetComputation *computation); /* diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 9d6ff739768..c0c03a13ab5 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -34,7 +34,13 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &propagate_in_place); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &backprop_in_place); - ExpectToken(is, binary, ""); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &optimize_row_ops); + ReadToken(is, binary, &tok); + } + KALDI_ASSERT(tok == ""); ReadBasicType(is, binary, &convert_addition); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &remove_assignments); @@ -52,7 +58,6 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &min_deriv_time); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &max_deriv_time); - std::string tok; ReadToken(is, binary, &tok); if (tok == "") { ReadBasicType(is, binary, &max_deriv_time_relative); @@ -73,6 +78,8 @@ void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, propagate_in_place); WriteToken(os, binary, ""); WriteBasicType(os, binary, backprop_in_place); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, optimize_row_ops); WriteToken(os, binary, ""); WriteBasicType(os, binary, convert_addition); WriteToken(os, binary, ""); @@ -193,9 +200,8 @@ void RemoveUnnecessaryZeroing(const Nnet &nnet, continue; // nothing to do. if (computation->commands[allocate_command].command_type != kAllocMatrixZeroed) { - KALDI_ASSERT(computation->commands[allocate_command].command_type == - kAllocMatrixUndefined); - continue; // already leaving it undefined, so nothing to do. + continue; // already leaving it undefined, or it's an input, so nothing + // to do. 
} std::vector variables_for_matrix; a.variables.AppendVariablesForMatrix(matrix_index, &variables_for_matrix); @@ -294,7 +300,8 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, if (command.command_type == kAllocMatrixZeroed || command.command_type == kAllocMatrixUndefined || command.command_type == kDeallocMatrix) { - int32 m = command.arg1, num_rows = computation->matrices[m].num_rows, + int32 s = command.arg1, m = computation->submatrices[s].matrix_index, + num_rows = computation->matrices[m].num_rows, num_cols = computation->matrices[m].num_cols, num_cols_mod = num_cols * ( computation->matrices[m].stride_type == kDefaultStride ? 1 : -1); @@ -336,33 +343,22 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, kAllocMatrixFromOtherZeroed; } RemoveNoOps(computation); + FixGotoLabel(computation); } void VariableMergingOptimization(const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation) { bool changed = true; while (changed) { changed = false; - VariableMergingOptimizer opt(config, nnet, request, computation); + VariableMergingOptimizer opt(config, nnet, computation); if (opt.MergeVariables()) changed = true; } } -// This is a simplified top-level interface to the model-update consolidation -// code from class ModelUpdateConsolidator. -void ConsolidateModelUpdate(const Nnet &nnet, - const ComputationRequest &request, - NnetComputation *computation) { - if (!request.need_model_derivative) - return; // An optimization; there would be nothing to do in this case. - ModelUpdateConsolidator consolidator(nnet, computation); - consolidator.ConsolidateModelUpdate(); -} - void ConvertAdditionToAssignment(const Nnet &nnet, NnetComputation *computation) { @@ -414,15 +410,30 @@ void ConvertAdditionToAssignment(const Nnet &nnet, } } + +int32 MaxOutputTimeInRequest(const ComputationRequest &request) { + int32 ans = std::numeric_limits::min(); + for (size_t i = 0; i < request.outputs.size(); i++) { + const std::vector &indexes (request.outputs[i].indexes); + std::vector::const_iterator iter = indexes.begin(), + end = indexes.end(); + for (; iter != end; ++iter) + if (iter->t > ans) + ans = iter->t; + } + if (ans == std::numeric_limits::min()) { + KALDI_ERR << "Failed to find any output indexes in computation request."; + } + return ans; +} + + void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, + int32 max_output_time_in_request, NnetComputation *computation) { - if (!config.optimize) - return; - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, true); + CheckComputation(nnet, *computation, true); { // Call LimitDerivativeTimes(). 
// this will do nothing unless --min-deriv-time or --max-deriv-time @@ -430,50 +441,91 @@ void Optimize(const NnetOptimizeOptions &config, int32 max_deriv_time = config.max_deriv_time; if (config.max_deriv_time_relative != std::numeric_limits::max()) max_deriv_time = config.max_deriv_time_relative + - MaxOutputTimeInRequest(request); + max_output_time_in_request; LimitDerivativeTimes(nnet, config.min_deriv_time, max_deriv_time, computation); } if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, true); + CheckComputation(nnet, *computation, true); - if (config.consolidate_model_update) - ConsolidateModelUpdate(nnet, request, computation); + if (config.optimize && config.consolidate_model_update) + ConsolidateModelUpdate(nnet, computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, true); + CheckComputation(nnet, *computation, true); - if (config.convert_addition) + if (config.optimize && config.convert_addition) { ConvertAdditionToAssignment(nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, true); + } - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, true); + if (config.optimize && + (config.remove_assignments || config.backprop_in_place || + config.propagate_in_place)) { + VariableMergingOptimization(config, nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } - if (config.remove_assignments || config.backprop_in_place || - config.propagate_in_place) - VariableMergingOptimization(config, nnet, request, computation); + if (config.optimize && config.optimize_row_ops) { + if (ReplaceRowWithMatrixOps(computation)) { + // if anything was changed... + + // We have to call RenumberComputation() to get rid of any removed + // indexes... actually this could be a little wasteful, but unfortunately + // it doesn't seem like we'd otherwise be doing any renumbering past this + // point. + RenumberComputation(computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } + } - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, false); - if (config.initialize_undefined) + if (config.optimize && config.initialize_undefined) { RemoveUnnecessaryZeroing(nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, false); - - if (config.move_sizing_commands) + if (config.optimize && config.move_sizing_commands) { MoveSizingCommands(nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, false); + // the looped computation optimization has to go before + // 'RemoveUnnecessaryAllocation()'. We don't gate this by 'config.optimize' + // because it's necessary for looped computation to run. + if (config.optimize_looped_computation){ + OptimizeLoopedComputation(nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } - if (config.allocate_from_other) + if (config.optimize && config.allocate_from_other && + !config.optimize_looped_computation) { + // Don't do this if it's an looped computation because we're not sure if it + // would be correct in that case, as written. In any case the performance + // benefit is tiny. 
RemoveUnnecessaryAllocation(nnet, computation); + if (GetVerboseLevel() >= 4) + CheckComputation(nnet, *computation, false); + } + + // The following is not configurable because it is necessary for + // the computation to run correctly (we do it after compilation too, + // but the operations may have been put out of order by + // other optimizations.) + ConsolidateIoOperations(nnet, computation); + + if (config.optimize_looped_computation) + FixGotoLabel(computation); if (GetVerboseLevel() >= 4) - CheckComputation(nnet, request, *computation, false); + CheckComputation(nnet, *computation, false); } // ComputationRequests are distinguished by the names and indexes @@ -499,32 +551,32 @@ size_t ComputationRequestHasher::IoSpecificationToInt(const IoSpecification& spe // it makes the hasher faster. StringHasher string_hasher; ans = string_hasher(spec.name); - std::vector::const_iterator itr = spec.indexes.begin(), + std::vector::const_iterator iter = spec.indexes.begin(), end = spec.indexes.end(), med = end; - if (med > itr + n) + if (med > iter + n) med = iter + n; - for (; itr != med; ++itr) { - ans += (*itr).n * 1619; - ans += (*itr).t * 15649; - ans += (*itr).x * 89809; + for (; iter != med; ++iter) { + ans += iter->n * 1619; + ans += iter->t * 15649; + ans += iter->x * 89809; } // after the first 'n' values, look only at every n'th value. this makes the // hashing much faster, and in the kinds of structures that we actually deal // with, we shouldn't get unnecessary hash collisions as a result of this // optimization. - for (; iter < end; itr += n) { - ans += (*itr).n * 1619; - ans += (*itr).t * 15649; - ans += (*itr).x * 89809; + for (; iter < end; iter += n) { + ans += iter->n * 1619; + ans += iter->t * 15649; + ans += iter->x * 89809; } return ans; } void CachingOptimizingCompiler::UpdateCache(const ComputationRequest *request, NnetComputation *computation) { - if (computation_cache_.size() == cache_capacity_) { + if (computation_cache_.size() == config_.cache_capacity) { // full, locate the least-recently-accessed request const CacheType::iterator it = computation_cache_.find(access_queue_.front()); @@ -624,7 +676,9 @@ const NnetComputation* CachingOptimizingCompiler::Compile( ComputationChecker checker(check_config, nnet_, *computation); checker.Check(); } - Optimize(opt_config_, nnet_, *request, computation); + Optimize(opt_config_, nnet_, + MaxOutputTimeInRequest(*request), + computation); if (GetVerboseLevel() >= verbose_cutoff) { std::ostringstream os; computation->Print(os, nnet_); @@ -645,6 +699,173 @@ const NnetComputation* CachingOptimizingCompiler::Compile( return computation; } +/// Split the computation up into segments bounded by kNoOperationMarker. For +/// each segment, a pair of command-indexes (start, end) is output to the vector +/// 'segments', so the commands in the segment (not including +/// kNoOperationMarker) are numbered from start ... end - 1. +static void SplitComputationIntoSegments( + const NnetComputation &computation, + std::vector > *segments) { + + int32 num_commands = computation.commands.size(); + segments->clear(); + int32 cur_start = 0; + for (int32 c = 0; c < num_commands; c++) { + if (computation.commands[c].command_type == kNoOperationMarker) { + segments->push_back(std::pair(cur_start, c)); + cur_start = c + 1; + } + } + segments->push_back(std::pair(cur_start, num_commands)); +} + +// This is a helper function used in ConsolidateIoOperations(). 
+// +// Suppose we had something like this before ConsolidateIoOperations() (as would +// be printed by Print() + +// c90: output m50 to user [for node: 'output'] +// ... +// c100: [label for goto statement] +// c101: # computation segment separator [e.g., begin backward commands] +// ... +// c105: m62 = user input [for node: 'input'] +// ... +// c190: output m79 to user [for node: 'output'] +// ... +// c200: goto c100 +// +// this would get reordered to the following by ConsolidateIoOperations +// (the bulk of the code, before this function is called): +// +// c99: [label for goto statement] +// c100: output m50 to user [for node: 'output'] +// c101: # computation segment separator [e.g., begin backward commands] +// c102: m62 = user input [for node: 'input'] +// ... +// c199: goto c199 +// c200: output m79 to user [for node: 'output'] +// +// Now command c200 is unreachable, but there is a similar command at c100 +// (after the goto) that will substitute. However, the matrix indexes are different. +// So we need to change the above so that the last two commands read: +// c199: m50.swap(m79} +// c200: goto c199 +void FixGotoOutputReordering(const Nnet &nnet, + NnetComputation *computation) { + FixGotoLabel(computation); // make sure the destination label of the goto statement was + // correct. + int32 goto_command_index = -1; + for (int32 c = computation->commands.size(); c >= 0; c--) + if (computation->commands[c].command_type == kGotoLabel) + goto_command_index = c; + KALDI_ASSERT(goto_command_index > 0); + int32 goto_label_index = computation->commands[goto_command_index].arg1; + + std::vector output_commands_after_goto, + output_commands_after_label; + for (int32 c = goto_command_index + 1; + c < static_cast(computation->commands.size()); c++) { + KALDI_ASSERT(computation->commands[c].command_type == kProvideOutput); + output_commands_after_goto.push_back(c); + } + for (int32 c = goto_label_index + 1; + c < goto_command_index; c++) { // note: we break from this loop. + CommandType t = computation->commands[c].command_type; + if (t == kProvideOutput) + output_commands_after_label.push_back(c); + else if (t != kNoOperationMarker && t != kAcceptInput) + break; + } + if (output_commands_after_goto.size() != output_commands_after_label.size()) { + computation->Print(std::cerr, nnet); + KALDI_ERR << "Could not fix goto/output reordering, size mismatch."; + } + NnetComputation::Command goto_command = computation->commands[goto_command_index]; + // be we'll be replacing the final kProvideOutput commands with + // kAllocMatrixFromOther [i.e. swap commands], and moving them one command + // backward; later we'll put the goto command at the end. + for (size_t i = 0; i < output_commands_after_goto.size(); i++) { + int32 c1 = output_commands_after_label[i], + c2 = output_commands_after_goto[i], + new_c2 = c2 - 1; + int32 s1 = computation->commands[c1].arg1, + s2 = computation->commands[c2].arg1; + // The following assert checks that the network node-index is the same... + // the idea is that the outputs should have been provided in the same order. + // I can think of no reason why the order might be different. + KALDI_ASSERT(computation->commands[c1].arg2 == + computation->commands[c1].arg2); + computation->commands[new_c2].command_type = kAllocMatrixFromOther; + computation->commands[new_c2].arg1 = s1; + computation->commands[new_c2].arg2 = s2; + } + // ... and move the goto command to the end. 
+ computation->commands.back() = goto_command; +} + + +void ConsolidateIoOperations(const Nnet &nnet, + NnetComputation *computation) { + bool ends_with_goto = + (!computation->commands.empty() && + computation->commands.back().command_type == kGotoLabel); + + // These segments, represented as (start-index, end-index), + // are segments of the computation separated by kNoOperationMarker. + std::vector > segments; + SplitComputationIntoSegments(*computation, &segments); + + int32 num_commands = computation->commands.size(); + std::vector reordered_commands(num_commands); + // put kNoOperationMarker between all segments in the reordered commands. + for (size_t s = 0; s + 1 < segments.size(); s++) + reordered_commands[segments[s].second].command_type = kNoOperationMarker; + + // for each segment we'll divide the commands up into those that must appear + // at the left of the segment (kAcceptInput for inputs and output-derivs), those + // that must appear in the middle (most commands), those that must appear + // on the right (kProvideOutput for output nodes and input derivatives). + std::vector left_commands, middle_commands, right_commands; + + for (size_t s = 0; s < segments.size(); s++) { + int32 segment_start = segments[s].first, + segment_end = segments[s].second; + left_commands.clear(); + middle_commands.clear(); + right_commands.clear(); + for (int32 c = segment_start; c < segment_end; c++) { + if (computation->commands[c].command_type == kProvideOutput) { + right_commands.push_back(c); + } else if (computation->commands[c].command_type == kAcceptInput) { + left_commands.push_back(c); + } else { + middle_commands.push_back(c); + } + } + std::vector::const_iterator iter = left_commands.begin(), + end = left_commands.end(); + int32 c = segment_start; + for (; iter != end; ++iter, ++c) + reordered_commands[c] = computation->commands[*iter]; + iter = middle_commands.begin(); + end = middle_commands.end(); + for (; iter != end; ++iter, ++c) + reordered_commands[c] = computation->commands[*iter]; + iter = right_commands.begin(); + end = right_commands.end(); + for (; iter != end; ++iter, ++c) + reordered_commands[c] = computation->commands[*iter]; + KALDI_ASSERT(c == segment_end); + } + computation->commands.swap(reordered_commands); + + if (ends_with_goto) + FixGotoOutputReordering(nnet, computation); +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 732f11e29ac..0df50b329a9 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -1,7 +1,7 @@ // nnet3/nnet-optimize.h -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) -// 2015 Xiaohui Zhang +// Copyright 2015-2016 Johns Hopkins University (author: Daniel Povey) +// 2015 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -37,6 +37,7 @@ struct NnetOptimizeOptions { bool consolidate_model_update; bool propagate_in_place; bool backprop_in_place; + bool optimize_row_ops; bool convert_addition; bool remove_assignments; bool allow_left_merge; @@ -47,12 +48,17 @@ struct NnetOptimizeOptions { int32 min_deriv_time; int32 max_deriv_time; int32 max_deriv_time_relative; + // optimize_looped_computation is a 'hidden config' not available from + // the command line; it's set to true to enable the optimization for + // looped computation that turns a linear computation into a loop. 
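As a toy illustration of the control flow that the looped-computation optimization referred to here sets up (a block of per-chunk commands ending in a goto back to a label), the following self-contained sketch runs a fixed command list in a loop. The enum and names are invented for the sketch, and a real looped computation also swaps matrices at the boundary, as FixGotoOutputReordering above describes.

// Toy sketch: a command list that ends in a 'goto' back to a label, so the
// same block of per-chunk commands runs repeatedly.  Illustrative only.
#include <iostream>
#include <vector>

enum OpSketch { kLabelS, kProcessChunkS, kGotoS };

int main() {
  // commands: [label] [process chunk] [goto -> label]
  std::vector<OpSketch> commands = { kLabelS, kProcessChunkS, kGotoS };
  int goto_target = 0;      // index of the kLabelS command (like kGotoLabel's arg1).
  int chunks_wanted = 3, chunks_done = 0;
  for (int c = 0; c < (int)commands.size(); c++) {
    switch (commands[c]) {
      case kLabelS: break;  // no-op marker, like kNoOperationLabel.
      case kProcessChunkS:
        std::cout << "processing chunk " << chunks_done++ << "\n";
        break;
      case kGotoS:
        if (chunks_done < chunks_wanted) c = goto_target;  // jump back.
        break;
    }
  }
  return 0;
}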
+ bool optimize_looped_computation; NnetOptimizeOptions(): optimize(true), consolidate_model_update(true), propagate_in_place(true), backprop_in_place(true), + optimize_row_ops(true), convert_addition(true), remove_assignments(true), allow_left_merge(true), @@ -62,7 +68,8 @@ struct NnetOptimizeOptions { allocate_from_other(true), min_deriv_time(std::numeric_limits::min()), max_deriv_time(std::numeric_limits::max()), - max_deriv_time_relative(std::numeric_limits::max()) {} + max_deriv_time_relative(std::numeric_limits::max()), + optimize_looped_computation(false) { } void Register(OptionsItf *opts) { opts->Register("optimize", &optimize, "Set this to false to turn off all " @@ -75,6 +82,9 @@ struct NnetOptimizeOptions { "disable optimization that allows in-place propagation"); opts->Register("backprop-in-place", &backprop_in_place, "Set to false to " "disable optimization that allows in-place backprop"); + opts->Register("optimize-row-ops", &optimize_row_ops, "Set to false to " + "disable certain optimizations that act on operations of " + "type *Row*."); opts->Register("convert-addition", &convert_addition, "Set to false to " "disable the optimization that converts Add commands into " "Copy commands wherever possible."); @@ -114,10 +124,39 @@ struct NnetOptimizeOptions { bool operator == (const NnetOptimizeOptions &other) const; }; -/// This is the top-level function for optimizing a computation. + +/* This utility function, used in code that calls LimitDerivativeTimes() (and + required in code that calls Optimize(), returns the largest time + 't' in any of the 'outputs' in the computation request, or crashes if there + are no outputs (or no cindexes in those outputs). */ +int32 MaxOutputTimeInRequest(const ComputationRequest &request); + + +/** This is the top-level function for optimizing a computation. Note: it + should really be called OptimizeAndPostprocess(), because there is at least + one thing it does (reordering I/O commands) that is necessary for a + computation to be run. + + @param [in] config The options that control, among other things, + which optimizations to apply. + @param [in] nnet The neural net for which the computation is being built + @param [in] max_output_time_in_request This value is only needed when the + max-deriv-time-relative config value is set in + 'config'. It should be set to the largest 't' value + encountered in any of the indexes in the 'output' + IoSpecifications in the ComputationRequests used to + compile the computation. However if there are multiple + ComputationRequests (i.e. it was an online computation) + you can just set it to any value you want, because + backpropagation is not supported so the + max-deriv-time-relative configuration value would not + have any effect. + @param [in,out] computation The computation to be optimized; this function + modifies it in-place. + */ void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, + int32 max_output_time_in_request, NnetComputation *computation); // Hash function for ComputationRequest. 
It converts @@ -146,8 +185,6 @@ struct CachingOptimizingCompilerOptions { int32 write_cache; int32 cache_capacity; - - CachingOptimizingCompilerOptions(): use_shortcut(true), cache_capacity(64) { } @@ -172,13 +209,15 @@ struct CachingOptimizingCompilerOptions { class CachingOptimizingCompiler { public: CachingOptimizingCompiler(const Nnet &nnet, - const CachingOptimizingCompilerOptions &config): - nnet_(nnet), config_(config), cache_capacity_(capacity) { } + const CachingOptimizingCompilerOptions config = + CachingOptimizingCompilerOptions()): + nnet_(nnet), config_(config) { } /// Note: nnet is retained as a const reference but opt_config is copied. CachingOptimizingCompiler(const Nnet &nnet, const NnetOptimizeOptions &opt_config, - const CachingOptimizingCompilerOptions &config): + const CachingOptimizingCompilerOptions config = + CachingOptimizingCompilerOptions()): nnet_(nnet), config_(config), opt_config_(opt_config) { } ~CachingOptimizingCompiler(); @@ -219,9 +258,6 @@ class CachingOptimizingCompiler { NnetComputation *computation); // This function updates the recently accessed queue. void UpdateAccessQueue(CacheType::iterator &cit); - // This configuration value determines how many unique Computations - // to cache in our most-recently-used cache. - int32 cache_capacity_; }; @@ -265,7 +301,6 @@ void LimitDerivativeTimes(const Nnet &nnet, /// class ModelUpdateConsolidator. Will fail if called a /// second time. void ConsolidateModelUpdate(const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation); /// This converts addition operations (things with Add in their names) to @@ -278,7 +313,6 @@ void ConvertAdditionToAssignment(const Nnet &nnet, /// This wraps class VariableMergingOptimizer in a simplified interface. void VariableMergingOptimization(const NnetOptimizeOptions &config, const Nnet &nnet, - const ComputationRequest &request, NnetComputation *computation); @@ -298,6 +332,17 @@ void RemoveUnnecessaryAllocation(const Nnet &nnet, NnetComputation *computation); +/// This optimization puts the input operations (kAcceptInput) and output +/// operations (kProvideOutput) at the very beginning or end of segments of +/// computation, respectively. +/// +/// This is actually necessary for computations to be run easily, because if these +/// commands were interspersed with the regular commands, you'd have to +/// call computer.Run() between the individual AcceptInput() and GetOutput() +/// function calls. 
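A minimal standalone sketch of the reordering this performs within one segment is given below: commands of type kAcceptInput move to the front, kProvideOutput commands to the back, and everything else keeps its relative order in the middle. It uses a plain enum and std::stable_partition rather than the real NnetComputation command structures, so it only illustrates the idea.

// Sketch of the per-segment reordering done by ConsolidateIoOperations:
// inputs first, outputs last, all other commands in between, each group
// keeping its original relative order.  Not the actual implementation.
#include <algorithm>
#include <iostream>
#include <vector>

enum CommandTypeSketch { kAcceptInputS, kProvideOutputS, kOtherS };

void ConsolidateSegmentSketch(std::vector<CommandTypeSketch> *segment) {
  // stable_partition preserves relative order within each group.
  std::stable_partition(segment->begin(), segment->end(),
                        [](CommandTypeSketch c) { return c == kAcceptInputS; });
  std::stable_partition(segment->begin(), segment->end(),
                        [](CommandTypeSketch c) { return c != kProvideOutputS; });
}

int main() {
  std::vector<CommandTypeSketch> segment = {
    kOtherS, kProvideOutputS, kAcceptInputS, kOtherS, kAcceptInputS, kOtherS };
  ConsolidateSegmentSketch(&segment);
  for (CommandTypeSketch c : segment)
    std::cout << (c == kAcceptInputS ? "in " :
                  c == kProvideOutputS ? "out " : ". ");
  std::cout << "\n";  // prints: in in . . . out
  return 0;
}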
+void ConsolidateIoOperations(const Nnet &nnet, + NnetComputation *computation); + } // namespace nnet3 From b1cb7d336ecac462e3ae972fe17785c23b5036dd Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Wed, 14 Dec 2016 17:08:13 -0800 Subject: [PATCH 024/213] Getting shortcut compilation to the point where it's testable (test failing thouth) --- src/nnet3/nnet-common.h | 6 +- src/nnet3/nnet-optimize-test.cc | 78 ++++++++++++++---- src/nnet3/nnet-optimize-utils.cc | 124 ++++++++++++++++++++++++++++ src/nnet3/nnet-optimize-utils.h | 6 +- src/nnet3/nnet-optimize.cc | 137 +++++++++++++++++++++---------- src/nnet3/nnet-optimize.h | 36 ++++++-- 6 files changed, 316 insertions(+), 71 deletions(-) diff --git a/src/nnet3/nnet-common.h b/src/nnet3/nnet-common.h index e6e3abe705e..f76166c0758 100644 --- a/src/nnet3/nnet-common.h +++ b/src/nnet3/nnet-common.h @@ -55,9 +55,9 @@ struct Index { bool operator < (const Index &a) const { if (t < a.t) { return true; } else if (t > a.t) { return false; } - else if (n < a.n) { return true; } - else if (n > a.n) { return false; } - else return (x < a.x); + else if (x < a.x) { return true; } + else if (x > a.x) { return false; } + else return (n < a.n); } Index operator + (const Index &other) const { return Index(n+other.n, t+other.t, x+other.x); diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index 40f8d824a39..1a8a00e3abf 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -27,9 +27,12 @@ namespace kaldi { namespace nnet3 { -// Run the test wothout optimizations and with optimizations specified by the -// parameter. Only print warnings; we'll fail the whole test later. -static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { +// Run the test without optimizations and with optimizations specified by the +// configs (the optimized version is done with class CachingOptimizingCompiler). +// Only print warnings; we'll fail the whole test later. +static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config, + CachingOptimizingCompilerOptions compiler_config) { + //opt_config.convert_addition = false; //opt_config.remove_assignments = false; //opt_config.move_sizing_commands = false; @@ -60,7 +63,7 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { { std::ostringstream os; computation.Print(os, nnet); - KALDI_LOG << "Generated computation is: " << os.str(); + KALDI_LOG << "Generated computation with no optimization or shortcut is: " << os.str(); } CheckComputationOptions check_config; // we can do the rewrite check since it's before optimization. @@ -68,12 +71,11 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { ComputationChecker checker(check_config, nnet, computation); checker.Check(); - NnetComputation computation_opt(computation); + CachingOptimizingCompiler opt_compiler(nnet, opt_config, compiler_config); + + const NnetComputation &computation_opt = *opt_compiler.Compile(request); { - Optimize(opt_config, nnet, - MaxOutputTimeInRequest(request), - &computation_opt); std::ostringstream os; computation_opt.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); @@ -84,7 +86,8 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { compute_opts.debug = true; computation.ComputeCudaIndexes(); - computation_opt.ComputeCudaIndexes(); + // computation_opt has already had this function called. + Nnet nnet_to_update(nnet); // copy of the nnet that we update... 
needed to // test the consolidation of backprop commands, // otherwise the optimized and non-optimized @@ -179,6 +182,8 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config) { // the outputs are the same. static void UnitTestNnetOptimize() { NnetOptimizeOptions optimize_all; + CachingOptimizingCompilerOptions compiler_all; + // randomly sometimes set min_deriv and max_deriv to small/large values, // which will cause some of the LimitDerivativeTimes() code to be called // (without really changing anything). @@ -187,44 +192,83 @@ static void UnitTestNnetOptimize() { // this is useful for debugging as it removes nans: // optimize_all.initialize_undefined = false; - bool success = UnitTestNnetOptimizeWithOptions(optimize_all); + bool success = UnitTestNnetOptimizeWithOptions(optimize_all, + compiler_all); if (success) return; // Test failed with full optimization. Slowly retry with various // optimizations switched off. NnetOptimizeOptions optimize = optimize_all; - optimize.propagate_in_place = false; - bool succ_no_propagate_in_place = UnitTestNnetOptimizeWithOptions(optimize); + CachingOptimizingCompilerOptions compiler = compiler_all; + + compiler.use_shortcut = false; + bool succ_no_shortcut = UnitTestNnetOptimizeWithOptions(optimize, + compiler); + compiler = compiler_all; + + + optimize.propagate_in_place = false; + bool succ_no_propagate_in_place = UnitTestNnetOptimizeWithOptions(optimize, + compiler); optimize = optimize_all; + optimize.backprop_in_place = false; - bool succ_no_backprop_in_place = UnitTestNnetOptimizeWithOptions(optimize); + bool succ_no_backprop_in_place = UnitTestNnetOptimizeWithOptions(optimize, + compiler); + optimize = optimize_all; + optimize.optimize_row_ops = false; + bool succ_no_row_ops = UnitTestNnetOptimizeWithOptions(optimize, + compiler); optimize = optimize_all; - optimize.remove_assignments = false; - bool succ_no_remove_assignments = UnitTestNnetOptimizeWithOptions(optimize); + optimize.convert_addition = false; + bool succ_no_convert_addition = UnitTestNnetOptimizeWithOptions(optimize, + compiler); optimize = optimize_all; + + optimize.remove_assignments = false; + bool succ_no_remove_assignments = UnitTestNnetOptimizeWithOptions(optimize, + compiler); + optimize = optimize_all; + optimize.initialize_undefined = false; - bool succ_no_initialize_undefined = UnitTestNnetOptimizeWithOptions(optimize); + bool succ_no_initialize_undefined = UnitTestNnetOptimizeWithOptions(optimize, + compiler); + optimize = optimize_all; + optimize.allocate_from_other = false; + bool succ_no_allocate_from_other = UnitTestNnetOptimizeWithOptions(optimize, + compiler); optimize = optimize_all; + optimize.move_sizing_commands = false; - bool succ_no_move_sizing_commands = UnitTestNnetOptimizeWithOptions(optimize); + bool succ_no_move_sizing_commands = UnitTestNnetOptimizeWithOptions(optimize, + compiler); + optimize = optimize_all; #define KALDI_SUCCFAIL(b) ((b) ? "SUCCESS" : "FAILURE") KALDI_ERR << "Test failed with all optimizations enabled. Retried test with the " << "following optimizations turned off:" + << "\n use_shortcut ... " << KALDI_SUCCFAIL(succ_no_shortcut) << "\n propagate_in_place ... " << KALDI_SUCCFAIL(succ_no_propagate_in_place) << "\n backprop_in_place ... " << KALDI_SUCCFAIL(succ_no_backprop_in_place) + << "\n optimize_row_ops ... " << KALDI_SUCCFAIL(succ_no_row_ops) + << "\n convert_addition ... " << KALDI_SUCCFAIL(succ_no_convert_addition) << "\n remove_assignments ... 
" << KALDI_SUCCFAIL(succ_no_remove_assignments) << "\n initialize_undefined ... " << KALDI_SUCCFAIL(succ_no_initialize_undefined) + << "\n allocate_from_other ... " << KALDI_SUCCFAIL(succ_no_allocate_from_other) << "\n move_sizing_commands ... " << KALDI_SUCCFAIL(succ_no_move_sizing_commands); #undef KALDI_SUCCFAIL } + + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 62bda3a17e1..de25b8bcabb 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -2635,6 +2635,7 @@ void ComputationExpander::InitFastInfo() { // 'n' value be zero. KALDI_ASSERT(debug_info.cindexes[0].second.n == 0); bool is_fast = (debug_info.cindexes[1].second.n == 1); + n_fast_[m] = is_fast; bool do_check = (RandInt(0, 2) == 0); if (do_check) { @@ -2983,6 +2984,129 @@ void ComputationExpander::GetNewLocationInfo( } } + +void ExpandComputation(const Nnet &nnet, + const MiscComputationInfo &misc_info, + const NnetComputation &computation, + bool need_debug_info, + int32 num_n_values, + NnetComputation *expanded_computation) { + ComputationExpander expander(nnet, misc_info, computation, + need_debug_info, num_n_values, + expanded_computation); + expander.Expand(); +} + + + +// This helper function is used in RequestIsDecomposable(); you can work out +// what it does, and why, from the documentation of RequestIsDecomposable() in +// the header. +static bool IoSpecificationIsDecomposable(const IoSpecification &io_spec, + IoSpecification *mini_io_spec, + int32 *num_n_values_out) { + mini_io_spec->name = io_spec.name; + mini_io_spec->has_deriv = io_spec.has_deriv; + const std::vector &indexes = io_spec.indexes; + KALDI_ASSERT(!indexes.empty() && "Empty Indexes in computation request"); + // For a computation to be decomposable, the 'n' values need to vary from 0 to + // N-1 for some N > 2, and they need to be in some kind of regular order with + // suitable repetition-- either with the 'n' values varying the 'fastest', or + // the 'slowest' of all the indexes. + if (indexes[0].n != 0 || indexes.back().n < 2) { + return false; + } + int32 num_n_values = indexes.back().n + 1, + size = indexes.size(); + *num_n_values_out = num_n_values; + if (size % num_n_values != 0) + return false; + bool n_fast = (indexes[1].n == 1); + // if 'n_fast' is true, then the n index varies the fastest (stride == 1), + // otherwise it varies the slowest of any index. We require that it be one of + // these two options, otherwise we declare the computation to be + // non-decomposable. + + mini_io_spec->indexes.resize((size / num_n_values) * 2); + if (n_fast) { + // 'block_size' is the size of blocks with the same x,t values, which are + // expected to have n values 0, 1, ... num_n_values - 1. + // of course each block is of size num_n_values. + int32 num_blocks = size / num_n_values; + const Index *indexes_ptr = &(indexes[0]); + Index *indexes_out = &(mini_io_spec->indexes[0]); + for (int32 block = 0; block < num_blocks; block++) { + *(indexes_out++) = indexes_ptr[0]; // for n == 0 + *(indexes_out++) = indexes_ptr[1]; // for n == 1. + + // we expect all the indexes in this block to have the same x and t + // values, but n values increasing from 0 to num_n_values - 1. + int32 t = indexes_ptr->t, x = indexes_ptr->x; + + for (int32 n = 0; n < num_n_values; n++, indexes_ptr++) { + if (indexes_ptr->n != n || indexes_ptr->t != t || indexes_ptr->x != x) + return false; + } + } + } else { + // 'n' varies the slowest. 
+ int32 block_size = size / num_n_values; + mini_io_spec->indexes.clear(); + mini_io_spec->indexes.insert(mini_io_spec->indexes.end(), + indexes.begin(), + indexes.begin() + 2 * block_size); + + // now verify that it has the expected structure... + for (int32 i = 0; i < block_size; i++) { + const Index *indexes_ptr = &(indexes[i]); + int32 t = indexes_ptr->t, x = indexes_ptr->x; + for (int32 n = 0; n < num_n_values; n++, indexes_ptr += block_size) { + if (indexes_ptr->n != n || indexes_ptr->t != t || indexes_ptr->x != x) + return false; + } + } + } + return true; +} + +bool RequestIsDecomposable(const ComputationRequest &request, + ComputationRequest *mini_request, + int32 *num_n_values) { + size_t num_inputs = request.inputs.size(), + num_outputs = request.outputs.size(); + mini_request->inputs.resize(num_inputs); + mini_request->outputs.resize(num_outputs); + mini_request->need_model_derivative = request.need_model_derivative; + mini_request->store_component_stats = request.store_component_stats; + mini_request->misc_info = request.misc_info; + + KALDI_ASSERT(num_inputs != 0 && num_outputs != 0); + for (size_t i = 0; i < num_inputs; i++) { + int32 this_num_n_values = 0; + if (!IoSpecificationIsDecomposable(request.inputs[i], + &(mini_request->inputs[i]), + &this_num_n_values)) + return false; + if (i == 0) { + *num_n_values = this_num_n_values; + } else { + if (this_num_n_values != *num_n_values) + return false; // .. which would be odd. + } + } + for (size_t i = 0; i < num_outputs; i++) { + int32 this_num_n_values = 0; + if (!IoSpecificationIsDecomposable(request.outputs[i], + &(mini_request->outputs[i]), + &this_num_n_values)) + return false; + if (this_num_n_values != *num_n_values) + return false; // .. which would be odd. + } + return true; +} + + class ComputationLoopedOptimizer { public: ComputationLoopedOptimizer(const Nnet &nnet, diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 9977ca8952a..aec8c21a368 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -379,9 +379,9 @@ void LimitDerivativeTimes(const Nnet &nnet, reason to the order of the t and x values; the regularity on 'n' is all that we care about. */ -bool ComputationIsDecomposable(const ComputationRequest &request, - ComputationRequest *mini_request, - int32 *num_n_values); // TODO: implement this. 
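For illustration, the 'regular' index structure that RequestIsDecomposable looks for can be sketched in isolation as follows. The struct and function names here are made up, and this simplified check only stands in for IoSpecificationIsDecomposable above, which additionally constructs the 2-'n' mini request; it is not the actual Kaldi code.

// Simplified standalone check of the regular structure required for shortcut
// compilation: n must run 0..N-1 for some N > 2 and must vary either fastest
// (blocks of N with fixed t,x) or slowest (N equal blocks), as described in
// the header comments above.  Illustrative only.
#include <cassert>
#include <vector>

struct IndexSketch { int n, t, x; };

static bool HasRegularNStructure(const std::vector<IndexSketch> &idx,
                                 int *num_n_values) {
  if (idx.empty() || idx[0].n != 0 || idx.back().n < 2) return false;
  int N = idx.back().n + 1, size = static_cast<int>(idx.size());
  if (size % N != 0) return false;
  *num_n_values = N;
  bool n_fast = (idx[1].n == 1);
  if (n_fast) {  // blocks of N entries with identical (t, x) and n = 0..N-1.
    for (int b = 0; b < size / N; b++)
      for (int n = 0; n < N; n++) {
        const IndexSketch &e = idx[b * N + n];
        if (e.n != n || e.t != idx[b * N].t || e.x != idx[b * N].x) return false;
      }
  } else {       // N equal blocks; block i has n == i and the same (t, x) pattern.
    int block = size / N;
    for (int n = 0; n < N; n++)
      for (int i = 0; i < block; i++) {
        const IndexSketch &e = idx[n * block + i];
        if (e.n != n || e.t != idx[i].t || e.x != idx[i].x) return false;
      }
  }
  return true;
}

int main() {
  // n varies fastest: (t=0, n=0..2), (t=1, n=0..2) -> regular with N = 3.
  std::vector<IndexSketch> idx = { {0,0,0},{1,0,0},{2,0,0},{0,1,0},{1,1,0},{2,1,0} };
  int N = 0;
  assert(HasRegularNStructure(idx, &N) && N == 3);
  return 0;
}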
+bool RequestIsDecomposable(const ComputationRequest &request, + ComputationRequest *mini_request, + int32 *num_n_values); /** diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index c0c03a13ab5..c2cee31bbcc 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -575,7 +575,7 @@ size_t ComputationRequestHasher::IoSpecificationToInt(const IoSpecification& spe } void CachingOptimizingCompiler::UpdateCache(const ComputationRequest *request, - NnetComputation *computation) { + const NnetComputation *computation) { if (computation_cache_.size() == config_.cache_capacity) { // full, locate the least-recently-accessed request const CacheType::iterator it = @@ -647,58 +647,109 @@ CachingOptimizingCompiler::~CachingOptimizingCompiler() { const NnetComputation* CachingOptimizingCompiler::Compile( const ComputationRequest &in_request) { - NnetComputation *computation; - ComputationRequest *request; // find computation in the cache CacheType::iterator cit = computation_cache_.find(&in_request); if (cit == computation_cache_.end()) { - // if not found, compile and update cache - request = new ComputationRequest; - *request = in_request; - Compiler compiler(*request, nnet_); - CompilerOptions opts; - computation = new NnetComputation; - compiler.CreateComputation(opts, computation); - - int32 verbose_cutoff = 4; - if (GetVerboseLevel() >= verbose_cutoff) { - std::ostringstream os1; - request->Print(os1); - KALDI_LOG << "Computation request is " << os1.str(); - std::ostringstream os2; - computation->Print(os2, nnet_); - KALDI_LOG << "Generated computation is: " << os2.str(); - } - { // some checking. - CheckComputationOptions check_config; - // we can do the rewrite check since it's before optimization. - check_config.check_rewrite = true; - ComputationChecker checker(check_config, nnet_, *computation); - checker.Check(); - } - Optimize(opt_config_, nnet_, - MaxOutputTimeInRequest(*request), - computation); - if (GetVerboseLevel() >= verbose_cutoff) { - std::ostringstream os; - computation->Print(os, nnet_); - KALDI_LOG << "Optimized computation is: " << os.str(); - } - { // check the computation again. - CheckComputationOptions check_config; - ComputationChecker checker(check_config, nnet_, *computation); - checker.Check(); - } - computation->ComputeCudaIndexes(); - UpdateCache(request, computation); + return CompileAndCache(in_request); } else { // if found, update access queue - computation = cit->second.first; + const NnetComputation *computation = cit->second.first; UpdateAccessQueue(cit); + return computation; } +} + +const NnetComputation* CachingOptimizingCompiler::CompileAndCache( + const ComputationRequest &in_request) { + // we need to make a copy of ComputationRequest, because it's stored + // as the key in the cache, and we need to own the pointer. + ComputationRequest *request = new ComputationRequest(in_request); + + const NnetComputation *computation = CompileViaShortcut(*request); + if (computation == NULL) + computation = CompileNoShortcut(*request); + UpdateCache(request, computation); + return computation; +} + + +const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut( + const ComputationRequest &request) { + + Compiler compiler(request, nnet_); + // note: 'opts' only contains 'output_debug_info', which is true by default. + // There may be situations where we'd prefer not to keep it, for speed. 
+ CompilerOptions opts; + NnetComputation *computation = new NnetComputation; + compiler.CreateComputation(opts, computation); + + int32 verbose_cutoff = 4; + if (GetVerboseLevel() >= verbose_cutoff) { + std::ostringstream os1; + request.Print(os1); + KALDI_LOG << "Computation request is " << os1.str(); + std::ostringstream os2; + computation->Print(os2, nnet_); + KALDI_LOG << "Generated computation is: " << os2.str(); + } + { // some checking. Note: there may be a time when we might + // prefer not do to this checking. + CheckComputationOptions check_config; + // we can do the rewrite check since it's before optimization. + check_config.check_rewrite = true; + ComputationChecker checker(check_config, nnet_, *computation); + checker.Check(); + } + Optimize(opt_config_, nnet_, + MaxOutputTimeInRequest(request), + computation); + if (GetVerboseLevel() >= verbose_cutoff) { + std::ostringstream os; + computation->Print(os, nnet_); + KALDI_LOG << "Optimized computation is: " << os.str(); + } + { // check the computation again. + CheckComputationOptions check_config; + ComputationChecker checker(check_config, nnet_, *computation); + checker.Check(); + } + computation->ComputeCudaIndexes(); return computation; } + +const NnetComputation* CachingOptimizingCompiler::CompileViaShortcut( + const ComputationRequest &request) { + if (!config_.use_shortcut) + return NULL; + + int32 num_n_values; + ComputationRequest mini_request; + if (!RequestIsDecomposable(request, &mini_request, &num_n_values)) + return NULL; + + // by invoking Compile() on the mini request, we go through the same + // caching process as for any externally requested computation. + // note: this pointer is not being 'given to us'... it's owned in + // the cache. + const NnetComputation *mini_computation = Compile(mini_request); + + // note: by default we always create debug_info, even in regular compilation. + // (e.g. it defaults to true in CompilerOptions). If it really seems to be a + // significant overhead, we can revisit this at some point in future. + bool need_debug_info = true; + + + NnetComputation *ans = new NnetComputation(); + + ExpandComputation(nnet_, request.misc_info, *mini_computation, + need_debug_info, num_n_values, ans); + + return ans; +} + + + /// Split the computation up into segments bounded by kNoOperationMarker. For /// each segment, a pair of command-indexes (start, end) is output to the vector /// 'segments', so the commands in the segment (not including diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 0df50b329a9..1ca776d4ee6 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -182,7 +182,6 @@ struct ComputationRequestPtrEqual { struct CachingOptimizingCompilerOptions { bool use_shortcut; - int32 write_cache; int32 cache_capacity; CachingOptimizingCompilerOptions(): @@ -229,6 +228,32 @@ class CachingOptimizingCompiler { void ReadCache(std::istream &is, bool binary); void WriteCache(std::ostream &os, bool binary) const; private: + // This function, called from Compile(), is called when a ComputationRequest + // has been determined not to have already been cached. It otherwise has the + // same interface as Compile(), but assumes that there is nothing cached for + // this computation as yet. It compiles the computation and takes care of + // caching it. 
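The cache behind Compile() and CompileAndCache() is a most-recently-used store: a hash map keyed by the request plus an access queue, with the least-recently-used entry evicted once capacity is reached. Below is a self-contained sketch of that policy using string keys standing in for hashed ComputationRequests; the class name is invented and this is not the actual CacheType, only an illustration of the eviction scheme.

// Standalone sketch of an LRU cache: a map from key to (value, position in an
// access queue); a lookup moves the key to the back of the queue, and an
// insert evicts the front (least-recently-used) entry at capacity.
#include <cassert>
#include <iterator>
#include <list>
#include <string>
#include <unordered_map>
#include <utility>

class LruCacheSketch {
 public:
  explicit LruCacheSketch(size_t capacity): capacity_(capacity) { }

  // Returns the cached value, or -1 if not present; refreshes recency on a hit.
  int Lookup(const std::string &key) {
    MapType::iterator it = map_.find(key);
    if (it == map_.end()) return -1;
    queue_.splice(queue_.end(), queue_, it->second.second);  // move to back.
    return it->second.first;
  }

  // Assumes 'key' is not already cached (as in the compiler's usage).
  void Insert(const std::string &key, int value) {
    if (map_.size() == capacity_) {   // evict the least-recently-used entry.
      map_.erase(queue_.front());
      queue_.pop_front();
    }
    queue_.push_back(key);
    map_[key] = std::make_pair(value, std::prev(queue_.end()));
  }

 private:
  typedef std::unordered_map<std::string,
      std::pair<int, std::list<std::string>::iterator> > MapType;
  size_t capacity_;
  std::list<std::string> queue_;  // front = least recently used.
  MapType map_;
};

int main() {
  LruCacheSketch cache(2);
  cache.Insert("request-a", 1);
  cache.Insert("request-b", 2);
  assert(cache.Lookup("request-a") == 1);   // 'a' is now most recent.
  cache.Insert("request-c", 3);             // evicts 'b', the LRU entry.
  assert(cache.Lookup("request-b") == -1);
  assert(cache.Lookup("request-a") == 1 && cache.Lookup("request-c") == 3);
  return 0;
}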
+ const NnetComputation* CompileAndCache(const ComputationRequest &request); + + + // This function, called from CompileAndCache(), tries to compile the + // ComputationRequest 'request' via 'shortcut' compilation; if this is + // possible, it returns a pointer to a newly allocated computation that it has + // compiled this way (note: this computation will not yet have been placed in + // the computation cache). If this is not possible for some reason + // (e.g. shortcut compilation is disabled in the config; or the computation + // request was not decomposable because of too few n values or irregular or + // unexpected structure), this function returns NULL and you should compile + // via CompileNoShortcut. + const NnetComputation* CompileViaShortcut(const ComputationRequest &request); + + // This function, called from CompileAndCache(), tries to compile the + // ComputationRequest 'request' via the regular (not shortcut) compilation + // process; it returns a pointer to a newly allocated computation that it has + // compiled this way (note: this computation will not yet have been placed in + // the computation cache). + const NnetComputation* CompileNoShortcut(const ComputationRequest &request); + const Nnet &nnet_; CachingOptimizingCompilerOptions config_; NnetOptimizeOptions opt_config_; @@ -245,9 +270,10 @@ class CachingOptimizingCompiler { // Map from computation-request to pair of (computation, and position in // access_queue_). Used for fast lookup of previously compiled computations. // All pointers are owned here. - typedef unordered_map, ComputationRequestHasher, - ComputationRequestPtrEqual> CacheType; + typedef unordered_map, + ComputationRequestHasher, + ComputationRequestPtrEqual> CacheType; CacheType computation_cache_; // This function updates the computation cache. It is called within Compile(). @@ -255,7 +281,7 @@ class CachingOptimizingCompiler { // the queue, and purges the least-recently-accessed request from the queue and // the cache if the capacity is reached. void UpdateCache(const ComputationRequest *request, - NnetComputation *computation); + const NnetComputation *computation); // This function updates the recently accessed queue. void UpdateAccessQueue(CacheType::iterator &cit); }; From e44ba7797531522401f930e35e4b06fde17548f2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 15 Dec 2016 03:31:20 -0500 Subject: [PATCH 025/213] Fix various bugs in shortcut compilation; add further testing code --- src/nnet3/nnet-derivative-test.cc | 46 ++--- src/nnet3/nnet-optimize-test.cc | 280 +++++++++++++++--------------- src/nnet3/nnet-optimize-utils.cc | 86 ++++----- src/nnet3/nnet-optimize.cc | 2 + 4 files changed, 204 insertions(+), 210 deletions(-) diff --git a/src/nnet3/nnet-derivative-test.cc b/src/nnet3/nnet-derivative-test.cc index 5dbc8a126d1..4289b577a25 100644 --- a/src/nnet3/nnet-derivative-test.cc +++ b/src/nnet3/nnet-derivative-test.cc @@ -95,7 +95,7 @@ void UnitTestNnetModelDerivatives() { //gen_config.allow_nonlinearity = false; //gen_config.allow_recursion = false; //gen_config.allow_final_nonlinearity = true; - bool allow_optimization = true; + bool limit_deriv_times = (RandInt(0, 2) == 0); std::vector configs; @@ -118,44 +118,23 @@ void UnitTestNnetModelDerivatives() { // whether input-derivatives are required or not does not matter, // so leave it as it is in that regard. 
- NnetComputation computation; - Compiler compiler(request, nnet); - - CompilerOptions opts; - compiler.CreateComputation(opts, &computation); - { - std::ostringstream os; - computation.Print(os, nnet); - KALDI_LOG << "Generated computation is: " << os.str(); + NnetOptimizeOptions optimize_opts; + CachingOptimizingCompilerOptions compiler_opts; + if (limit_deriv_times) { + SetDerivTimesOptions(request, &optimize_opts); } - CheckComputationOptions check_config; - // we can do the rewrite check since it's before optimization. - check_config.check_rewrite = true; - ComputationChecker checker(check_config, nnet, computation); - checker.Check(); - if (RandInt(0, 3) != 0 && allow_optimization) { - NnetOptimizeOptions opt_config; - if (limit_deriv_times) - SetDerivTimesOptions(request, &opt_config); + CachingOptimizingCompiler compiler(nnet, optimize_opts, + compiler_opts); - Optimize(opt_config, nnet, - MaxOutputTimeInRequest(request), - &computation); + const NnetComputation &computation = *(compiler.Compile(request)); + + { std::ostringstream os; computation.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); - check_config.check_rewrite = false; - ComputationChecker checker_opt(check_config, nnet, computation); - checker_opt.Check(); } - NnetComputeOptions compute_opts; - if (RandInt(0, 1) == 0) - compute_opts.debug = true; - computation.ComputeCudaIndexes(); - - Nnet nnet_deriv(nnet); bool is_gradient = true; SetZero(is_gradient, &nnet_deriv); // forces "simple" update and unit @@ -179,6 +158,11 @@ void UnitTestNnetModelDerivatives() { nnet.OutputDim("output")); output_deriv.SetRandn(); + + NnetComputeOptions compute_opts; + if (RandInt(0, 1) == 0) + compute_opts.debug = true; + // pass 0 is the forward pass with the un-perturbed model. // Other passes are with various differently-perturbed versions of // the model. diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index 1a8a00e3abf..0654683aa9c 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -30,7 +30,8 @@ namespace nnet3 { // Run the test without optimizations and with optimizations specified by the // configs (the optimized version is done with class CachingOptimizingCompiler). // Only print warnings; we'll fail the whole test later. -static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config, +static bool UnitTestNnetOptimizeWithOptions(int32 srand_seed, + NnetOptimizeOptions opt_config, CachingOptimizingCompilerOptions compiler_config) { //opt_config.convert_addition = false; @@ -38,149 +39,150 @@ static bool UnitTestNnetOptimizeWithOptions(NnetOptimizeOptions opt_config, //opt_config.move_sizing_commands = false; //opt_config.allocate_from_other = false; - srand(0); // Every run must be deterministic. - for (int32 n = 0; n < 40; n++) { - struct NnetGenerationOptions gen_config; - - std::vector configs; - GenerateConfigSequence(gen_config, &configs); - Nnet nnet; - for (size_t j = 0; j < configs.size(); j++) { - KALDI_LOG << "Input config[" << j << "] is: " << configs[j]; - std::istringstream is(configs[j]); - nnet.ReadConfig(is); - } + srand(srand_seed); // so that we can compare between differnt optimization types + // with the randomly generated network staying the same. 
- ComputationRequest request; - std::vector > inputs; - ComputeExampleComputationRequestSimple(nnet, &request, &inputs); + struct NnetGenerationOptions gen_config; - NnetComputation computation; - Compiler compiler(request, nnet); + std::vector configs; + GenerateConfigSequence(gen_config, &configs); + Nnet nnet; + for (size_t j = 0; j < configs.size(); j++) { + KALDI_LOG << "Input config[" << j << "] is: " << configs[j]; + std::istringstream is(configs[j]); + nnet.ReadConfig(is); + } - CompilerOptions opts; - compiler.CreateComputation(opts, &computation); - { - std::ostringstream os; - computation.Print(os, nnet); - KALDI_LOG << "Generated computation with no optimization or shortcut is: " << os.str(); - } - CheckComputationOptions check_config; - // we can do the rewrite check since it's before optimization. - check_config.check_rewrite = true; - ComputationChecker checker(check_config, nnet, computation); - checker.Check(); + ComputationRequest request; + std::vector > inputs; + ComputeExampleComputationRequestSimple(nnet, &request, &inputs); - CachingOptimizingCompiler opt_compiler(nnet, opt_config, compiler_config); + NnetComputation computation; + Compiler compiler(request, nnet); - const NnetComputation &computation_opt = *opt_compiler.Compile(request); + CompilerOptions opts; + compiler.CreateComputation(opts, &computation); + { + std::ostringstream os; + computation.Print(os, nnet); + KALDI_LOG << "Generated computation with no optimization or shortcut is: " << os.str(); + } + CheckComputationOptions check_config; + // we can do the rewrite check since it's before optimization. + check_config.check_rewrite = true; + ComputationChecker checker(check_config, nnet, computation); + checker.Check(); - { - std::ostringstream os; - computation_opt.Print(os, nnet); - KALDI_LOG << "Optimized computation is: " << os.str(); - } + CachingOptimizingCompiler opt_compiler(nnet, opt_config, compiler_config); - NnetComputeOptions compute_opts; - if (RandInt(0, 1) == 0) - compute_opts.debug = true; - - computation.ComputeCudaIndexes(); - // computation_opt has already had this function called. - - Nnet nnet_to_update(nnet); // copy of the nnet that we update... needed to - // test the consolidation of backprop commands, - // otherwise the optimized and non-optimized - // comptuations differ. - bool is_gradient = true; // with natural gradient, the consolidation would - // affect the final model params -> test just the - // gradient. - SetZero(is_gradient, &nnet_to_update); - - NnetComputer computer(compute_opts, - computation, - nnet, - &nnet_to_update); - - Nnet nnet_opt(nnet); // copy of the nnet for the optimized computation. - // necessary in case backprop changes parameters. - Nnet nnet_opt_to_update(nnet_opt); - SetZero(is_gradient, &nnet_opt_to_update); - - // NnetComputer for the optimized version of the computation. - NnetComputer computer_opt(compute_opts, - computation_opt, - nnet_opt, - &nnet_opt_to_update); - - // provide the input to the computations. 
- for (size_t i = 0; i < request.inputs.size(); i++) { - CuMatrix temp(inputs[i]); - KALDI_LOG << "Input sum is " << temp.Sum(); - computer.AcceptInput(request.inputs[i].name, &temp); - CuMatrix temp2(inputs[i]); - computer_opt.AcceptInput(request.inputs[i].name, &temp2); - } - KALDI_LOG << "Running non-optimized forward computation"; - computer.Run(); - KALDI_LOG << "Running optimized forward computation"; - computer_opt.Run(); + const NnetComputation &computation_opt = *opt_compiler.Compile(request); - const CuMatrixBase &output(computer.GetOutput("output")); - KALDI_LOG << "Output sum (not optimized) is " << output.Sum(); - const CuMatrixBase &output_opt(computer_opt.GetOutput("output")); - KALDI_LOG << "Output sum (optimized) is " << output_opt.Sum(); - if (!ApproxEqual(output, output_opt)) { - KALDI_WARN << "Non-optimized and optimized versions of the computation give " - << "different outputs."; - return false; - } + { + std::ostringstream os; + computation_opt.Print(os, nnet); + KALDI_LOG << "Optimized computation is: " << os.str(); + } + + NnetComputeOptions compute_opts; + if (RandInt(0, 1) == 0) + compute_opts.debug = true; + + computation.ComputeCudaIndexes(); + // computation_opt has already had this function called. + + Nnet nnet_to_update(nnet); // copy of the nnet that we update... needed to + // test the consolidation of backprop commands, + // otherwise the optimized and non-optimized + // comptuations differ. + bool is_gradient = true; // with natural gradient, the consolidation would + // affect the final model params -> test just the + // gradient. + SetZero(is_gradient, &nnet_to_update); + + NnetComputer computer(compute_opts, + computation, + nnet, + &nnet_to_update); + + Nnet nnet_opt(nnet); // copy of the nnet for the optimized computation. + // necessary in case backprop changes parameters. + Nnet nnet_opt_to_update(nnet_opt); + SetZero(is_gradient, &nnet_opt_to_update); + + // NnetComputer for the optimized version of the computation. + NnetComputer computer_opt(compute_opts, + computation_opt, + nnet_opt, + &nnet_opt_to_update); + + // provide the input to the computations. 
+ for (size_t i = 0; i < request.inputs.size(); i++) { + CuMatrix temp(inputs[i]); + KALDI_LOG << "Input sum is " << temp.Sum(); + computer.AcceptInput(request.inputs[i].name, &temp); + CuMatrix temp2(inputs[i]); + computer_opt.AcceptInput(request.inputs[i].name, &temp2); + } + KALDI_LOG << "Running non-optimized forward computation"; + computer.Run(); + KALDI_LOG << "Running optimized forward computation"; + computer_opt.Run(); + + const CuMatrixBase &output(computer.GetOutput("output")); + KALDI_LOG << "Output sum (not optimized) is " << output.Sum(); + const CuMatrixBase &output_opt(computer_opt.GetOutput("output")); + KALDI_LOG << "Output sum (optimized) is " << output_opt.Sum(); + if (!ApproxEqual(output, output_opt)) { + KALDI_WARN << "Non-optimized and optimized versions of the computation give " + << "different outputs."; + return false; + } + + CuMatrix output_deriv(output.NumRows(), output.NumCols()); + output_deriv.SetRandn(); + CuMatrix output_deriv_opt(output_deriv); - CuMatrix output_deriv(output.NumRows(), output.NumCols()); - output_deriv.SetRandn(); - CuMatrix output_deriv_opt(output_deriv); - - if (request.outputs[0].has_deriv) { - computer.AcceptInput("output", &output_deriv); - computer_opt.AcceptInput("output", &output_deriv_opt); - - KALDI_LOG << "Running non-optimized backward computation"; - computer.Run(); - KALDI_LOG << "Running optimized backward computation"; - computer_opt.Run(); - for (size_t i = 0; i < request.inputs.size(); i++) { - if (request.inputs[i].has_deriv) { - const CuMatrixBase &in_deriv = - computer.GetOutput(request.inputs[i].name); - const CuMatrixBase &in_deriv_opt = - computer_opt.GetOutput(request.inputs[i].name); - KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name - << "' (non-optimized) is " << in_deriv.Sum(); - KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name - << "' (optimized) is " << in_deriv_opt.Sum(); - if (!ApproxEqual(in_deriv, in_deriv_opt)) { - KALDI_WARN << "Non-optimized and optimized versions of the " - << "computation give different input-derivs."; - return false; - } + if (request.outputs[0].has_deriv) { + computer.AcceptInput("output", &output_deriv); + computer_opt.AcceptInput("output", &output_deriv_opt); + + KALDI_LOG << "Running non-optimized backward computation"; + computer.Run(); + KALDI_LOG << "Running optimized backward computation"; + computer_opt.Run(); + for (size_t i = 0; i < request.inputs.size(); i++) { + if (request.inputs[i].has_deriv) { + const CuMatrixBase &in_deriv = + computer.GetOutput(request.inputs[i].name); + const CuMatrixBase &in_deriv_opt = + computer_opt.GetOutput(request.inputs[i].name); + KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name + << "' (non-optimized) is " << in_deriv.Sum(); + KALDI_LOG << "Input-deriv sum for input '" << request.inputs[i].name + << "' (optimized) is " << in_deriv_opt.Sum(); + if (!ApproxEqual(in_deriv, in_deriv_opt)) { + KALDI_WARN << "Non-optimized and optimized versions of the " + << "computation give different input-derivs."; + return false; } } } + } - if (!NnetParametersAreIdentical(nnet_to_update, - nnet_opt_to_update, 1.0e-05)) { - KALDI_WARN << "Neural networks differ after training, between " - << "optimized and non-optimized computation."; - return false; - } + if (!NnetParametersAreIdentical(nnet_to_update, + nnet_opt_to_update, 1.0e-05)) { + KALDI_WARN << "Neural networks differ after training, between " + << "optimized and non-optimized computation."; + return false; + } else { + return true; } - 
return true; } // This test runs the computation with and without optimization, and checks that // the outputs are the same. -static void UnitTestNnetOptimize() { +static void UnitTestNnetOptimizeInternal(int32 srand_seed) { NnetOptimizeOptions optimize_all; CachingOptimizingCompilerOptions compiler_all; @@ -192,7 +194,7 @@ static void UnitTestNnetOptimize() { // this is useful for debugging as it removes nans: // optimize_all.initialize_undefined = false; - bool success = UnitTestNnetOptimizeWithOptions(optimize_all, + bool success = UnitTestNnetOptimizeWithOptions(srand_seed, optimize_all, compiler_all); if (success) return; @@ -204,48 +206,48 @@ static void UnitTestNnetOptimize() { compiler.use_shortcut = false; - bool succ_no_shortcut = UnitTestNnetOptimizeWithOptions(optimize, + bool succ_no_shortcut = UnitTestNnetOptimizeWithOptions(srand_seed, optimize, compiler); compiler = compiler_all; optimize.propagate_in_place = false; - bool succ_no_propagate_in_place = UnitTestNnetOptimizeWithOptions(optimize, + bool succ_no_propagate_in_place = UnitTestNnetOptimizeWithOptions(srand_seed, optimize, compiler); optimize = optimize_all; optimize.backprop_in_place = false; - bool succ_no_backprop_in_place = UnitTestNnetOptimizeWithOptions(optimize, + bool succ_no_backprop_in_place = UnitTestNnetOptimizeWithOptions(srand_seed, optimize, compiler); optimize = optimize_all; optimize.optimize_row_ops = false; - bool succ_no_row_ops = UnitTestNnetOptimizeWithOptions(optimize, + bool succ_no_row_ops = UnitTestNnetOptimizeWithOptions(srand_seed, optimize, compiler); optimize = optimize_all; optimize.convert_addition = false; - bool succ_no_convert_addition = UnitTestNnetOptimizeWithOptions(optimize, + bool succ_no_convert_addition = UnitTestNnetOptimizeWithOptions(srand_seed, optimize, compiler); optimize = optimize_all; optimize.remove_assignments = false; - bool succ_no_remove_assignments = UnitTestNnetOptimizeWithOptions(optimize, + bool succ_no_remove_assignments = UnitTestNnetOptimizeWithOptions(srand_seed, optimize, compiler); optimize = optimize_all; optimize.initialize_undefined = false; - bool succ_no_initialize_undefined = UnitTestNnetOptimizeWithOptions(optimize, + bool succ_no_initialize_undefined = UnitTestNnetOptimizeWithOptions(srand_seed, optimize, compiler); optimize = optimize_all; optimize.allocate_from_other = false; - bool succ_no_allocate_from_other = UnitTestNnetOptimizeWithOptions(optimize, + bool succ_no_allocate_from_other = UnitTestNnetOptimizeWithOptions(srand_seed, optimize, compiler); optimize = optimize_all; optimize.move_sizing_commands = false; - bool succ_no_move_sizing_commands = UnitTestNnetOptimizeWithOptions(optimize, + bool succ_no_move_sizing_commands = UnitTestNnetOptimizeWithOptions(srand_seed, optimize, compiler); optimize = optimize_all; @@ -265,7 +267,13 @@ static void UnitTestNnetOptimize() { #undef KALDI_SUCCFAIL } - +static void UnitTestNnetOptimize() { + for (int32 srand_seed = 0; srand_seed < 40; srand_seed++) { + KALDI_LOG << "About to run UnitTestNnetOptimizeInternal with srand_seed = " + << srand_seed; + UnitTestNnetOptimizeInternal(srand_seed); + } +} diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index de25b8bcabb..41f3acb3916 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -2293,9 +2293,9 @@ class ComputationExpander { // This function assumes that ComputeSubmatrixInfo() has already // been called. 
// Note: it returns true if the index 'old_row_index' into submatrix - // indexed 'old_submat_index' corresponds to an Index with n=0; otherwise + // indexed 'submat_index' corresponds to an Index with n=0; otherwise // it returns false and does not set the output values. - bool GetNewSubmatLocationInfo(int32 old_submat_index, + bool GetNewSubmatLocationInfo(int32 submat_index, int32 old_row_index, int32 *new_row_index, int32 *new_n_stride) const; @@ -2395,24 +2395,26 @@ void ComputationExpander::ExpandRowsCommand( // 's1' and submat2 is the submatrix referred to in 's2'. // 'indexes' has the same size as the num-rows of submat1, and the values // in the vector are row-indexes into s2. - const std::vector &old_indexes = computation_.indexes[c_in.arg3]; + int32 old_arg3 = c_out->arg3; c_out->arg3 = expanded_computation_->indexes.size(); expanded_computation_->indexes.push_back(std::vector()); std::vector &new_indexes = expanded_computation_->indexes.back(); + const std::vector &old_indexes = computation_.indexes[old_arg3]; int32 old_size = old_indexes.size(), num_n_values = num_n_values_, - new_size = expanded_computation_->submatrices[s1].num_rows; + new_s1_size = expanded_computation_->submatrices[s1].num_rows, + new_s2_size = expanded_computation_->submatrices[s2].num_rows; KALDI_ASSERT(old_size % 2 == 0 && old_size == computation_.submatrices[s1].num_rows); - new_indexes.resize(new_size, -1); + new_indexes.resize(new_s1_size, -1); for (int32 i1 = 0; i1 < old_size; i1++) { int32 new_i1_n0, new_n_stride1; if (GetNewSubmatLocationInfo(s1, i1, &new_i1_n0, &new_n_stride1)) { // GetNewSubmatLocationInfo() returns true if this corresponds to // a Cindex with n == 0. - int32 i2 = old_indexes[i1]; + int32 i2 = old_indexes[i1]; // note: i2 is the row index into submatrix s2. int32 new_i2_n0, new_n_stride2; if (i2 < 0) { // if i2 is -1, we'll just fill any relevant positions in // 'new_indexes' with -1's. @@ -2422,9 +2424,11 @@ void ComputationExpander::ExpandRowsCommand( KALDI_ASSERT(ans); // source should also be for n==0, because we don't // (or at least shouldn't) create computations that // mix up the 'n' values - for (int32 n = 0; n < num_n_values; n++) { - int32 new_i1 = new_i1_n0 + n * new_n_stride1, - new_i2 = new_i2_n0 + new_n_stride2; + + int32 new_i1 = new_i1_n0, new_i2 = new_i2_n0; + for (int32 n = 0; n < num_n_values; + ++n, new_i1 += new_n_stride1, new_i2 += new_n_stride2) { + KALDI_ASSERT(new_i1 < new_s1_size && new_i2 < new_s2_size); new_indexes[new_i1] = new_i2; } } @@ -2443,23 +2447,24 @@ void ComputationExpander::ExpandRowsMultiCommand( num_rows_old = computation_.submatrices[s1].num_rows, num_rows_new = expanded_computation_->submatrices[s1].num_rows; - const std::vector > &old_indexes_multi = - computation_.indexes_multi[c_in.arg2]; - // old_indexes_multi is a vector that has the same size as the num-rows - // of submatrix s1. It contains pairs that are either (-1, -1), or - // pairs (submatrix-index, row-index) referring to other submatrices - // in the computation. 
- - KALDI_ASSERT(static_cast(old_indexes_multi.size()) == num_rows_old); KALDI_ASSERT(num_rows_old % 2 == 0); int32 num_n_values = num_n_values_; - + int32 old_arg2 = c_out->arg2; c_out->arg2 = expanded_computation_->indexes_multi.size(); expanded_computation_->indexes_multi.push_back( std::vector >()); std::vector > &new_indexes_multi = expanded_computation_->indexes_multi.back(); + const std::vector > &old_indexes_multi = + computation_.indexes_multi[old_arg2]; + // old_indexes_multi is a vector that has the same size as the num-rows + // of submatrix s1. It contains pairs that are either (-1, -1), or + // pairs (submatrix-index, row-index) referring to other submatrices + // in the computation. + + KALDI_ASSERT(static_cast(old_indexes_multi.size()) == num_rows_old); + new_indexes_multi.resize(num_rows_new, std::pair(-1, -1)); @@ -2508,23 +2513,25 @@ void ComputationExpander::ExpandRowRangesCommand( num_rows_new = expanded_computation_->submatrices[s1].num_rows; KALDI_ASSERT(static_cast(c_in.arg3) < computation_.indexes_ranges.size()); - const std::vector > &old_indexes_ranges = - computation_.indexes_ranges[c_in.arg3]; - // old_indexes_ranges is a vector that has the same size as the num-rows of - // submatrix s1. It contains pairs that are either two copies of the same - // value (in practice the pair (-1, -1)), or pairs (begin-row-index, - // end-row-index) representing the (begin,end) of a range in submatrix s2. - // Note: end-row-index is one past the end of the range, as for C++ iterators. - - KALDI_ASSERT(static_cast(old_indexes_ranges.size()) == num_rows_old); KALDI_ASSERT(num_rows_old % 2 == 0); int32 num_n_values = num_n_values_; + + int32 old_arg3 = c_out->arg3; c_out->arg3 = expanded_computation_->indexes_ranges.size(); expanded_computation_->indexes_ranges.push_back( std::vector >()); std::vector > &new_indexes_ranges = expanded_computation_->indexes_ranges.back(); + const std::vector > &old_indexes_ranges = + computation_.indexes_ranges[old_arg3]; + // old_indexes_ranges is a vector that has the same size as the num-rows of + // submatrix s1. It contains pairs that are either two copies of the same + // value (in practice the pair (-1, -1)), or pairs (begin-row-index, + // end-row-index) representing the (begin,end) of a range in submatrix s2. + // Note: end-row-index is one past the end of the range, as for C++ iterators. + + KALDI_ASSERT(static_cast(old_indexes_ranges.size()) == num_rows_old); new_indexes_ranges.resize(num_rows_new, std::pair(-1, -1)); @@ -2815,10 +2822,6 @@ void ComputationExpander::ComputePrecomputedIndexes() { int32 num_commands = computation_.commands.size(), num_precomputed_indexes = computation_.component_precomputed_indexes.size(); - if (num_precomputed_indexes == 1) - return; // Nothing to compute. Note: element zero of - // component_precomputed_indexes is reserved for NULL. - std::vector need_backprop(num_precomputed_indexes, false); std::vector component_index(num_precomputed_indexes, -1); @@ -2860,8 +2863,8 @@ void ComputationExpander::ComputePrecomputedIndexes() { // the n indexes consist of the set (0, 1), and the computation we're // creating has more distinct n indexes than that. 
std::vector input_indexes, output_indexes; - ExpandIndexes(old_info.input_indexes, &new_info.input_indexes); - ExpandIndexes(old_info.output_indexes, &new_info.output_indexes); + ExpandIndexes(old_info.input_indexes, &input_indexes); + ExpandIndexes(old_info.output_indexes, &output_indexes); KALDI_ASSERT(component_index[p] >= 0); const Component *component = nnet_.GetComponent(component_index[p]); ComponentPrecomputedIndexes *expanded_precomputed_indexes = @@ -2877,18 +2880,19 @@ void ComputationExpander::ComputePrecomputedIndexes() { bool ComputationExpander::GetNewSubmatLocationInfo( - int32 old_submat_index, int32 old_row_index, + int32 submat_index, int32 old_row_index, int32 *new_row_index, int32 *new_n_stride) const { - int32 matrix_index = computation_.submatrices[old_submat_index].matrix_index, - row_offset = computation_.submatrices[old_submat_index].row_offset; + int32 matrix_index = computation_.submatrices[submat_index].matrix_index, + old_row_offset = computation_.submatrices[submat_index].row_offset, + new_row_offset = expanded_computation_->submatrices[submat_index].row_offset; const NnetComputation::MatrixDebugInfo &debug_info_in = computation_.matrix_debug_info[matrix_index]; - if (debug_info_in.cindexes[old_row_index + row_offset].second.n != 0) + if (debug_info_in.cindexes[old_row_index + old_row_offset].second.n != 0) return false; - GetNewMatrixLocationInfo(matrix_index, old_row_index + row_offset, + GetNewMatrixLocationInfo(matrix_index, old_row_index + old_row_offset, new_row_index, new_n_stride); - *new_row_index -= row_offset; + *new_row_index -= new_row_offset; return true; } @@ -2897,9 +2901,7 @@ void ComputationExpander::GetNewMatrixLocationInfo( int32 *new_row_index, int32 *new_n_stride) const { bool n_is_fast = n_fast_[old_matrix_index]; int32 num_rows = computation_.matrices[old_matrix_index].num_rows; - int32 n_stride; if (n_is_fast) { - n_stride = 1; // If the n index varies fast for this matrix, then the old row-index // should be a multiple of 2 because: // - we assume that the input computation was built for 2 n-values @@ -2957,9 +2959,7 @@ void ComputationExpander::GetNewLocationInfo( int32 num_indexes = indexes.size(); KALDI_ASSERT(num_indexes > 0 && num_indexes % 2 == 0 && indexes.front().n == 0 && indexes.back().n == 1); - int32 n_stride; if (is_fast) { - n_stride = 1; // If the n index varies fast for this matrix, then the old row-index // should be a multiple of 2 because: // - we assume that the input computation was built for 2 n-values diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index c2cee31bbcc..6da7699cb93 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -745,6 +745,8 @@ const NnetComputation* CachingOptimizingCompiler::CompileViaShortcut( ExpandComputation(nnet_, request.misc_info, *mini_computation, need_debug_info, num_n_values, ans); + ans->ComputeCudaIndexes(); + return ans; } From 8fd19592716b9ec51ceee9758429b22f9d38ba6c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 15 Dec 2016 18:14:09 -0500 Subject: [PATCH 026/213] Small documentation fix --- src/nnet3/nnet-am-decodable-simple.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nnet3/nnet-am-decodable-simple.h b/src/nnet3/nnet-am-decodable-simple.h index e604765e09a..acf0ba8e63a 100644 --- a/src/nnet3/nnet-am-decodable-simple.h +++ b/src/nnet3/nnet-am-decodable-simple.h @@ -72,11 +72,11 @@ struct NnetSimpleComputationOptions { "of the neural net's inherent right context (may be useful in " 
"recurrent setups"); opts->Register("extra-left-context-initial", &extra_left_context_initial, - "If >0, overrides the --extra-left-context value at the start " - "of an utterance."); + "If >= 0, overrides the --extra-left-context value at the " + "start of an utterance."); opts->Register("extra-right-context-final", &extra_right_context_final, - "If >0, overrides the --extra-right-context value at the end " - "of an utterance."); + "If >= 0, overrides the --extra-right-context value at the " + "end of an utterance."); opts->Register("frame-subsampling-factor", &frame_subsampling_factor, "Required if the frame-rate of the output (e.g. in 'chain' " "models) is less than the frame-rate of the original " From 67a8f7ac42c6b2de33ac6d4cd6b3d41df8f1771b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 15 Dec 2016 18:33:16 -0500 Subject: [PATCH 027/213] Remove no-longer-used option --cut-zero-frames from chain supervision-creation code --- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 4 +--- src/chain/chain-supervision.cc | 21 --------------------- src/chain/chain-supervision.h | 21 --------------------- src/chainbin/nnet3-chain-get-egs.cc | 24 +++++++----------------- 4 files changed, 8 insertions(+), 62 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index c7263f41698..7b330f8f717 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -26,8 +26,6 @@ frames_per_eg=25 # number of feature frames example (not counting added contex frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. # can be useful to avoid wasted data if you're using --left-deriv-truncate # and --right-deriv-truncate. -cut_zero_frames=-1 # if activated, activates new-style derivative weights.. i'll reorganize - # this if it works well. 
frame_subsampling_factor=3 # frames-per-second of features we train on divided # by frames-per-second at output of chain model alignment_subsampling_factor=3 # frames-per-second of input alignments divided @@ -294,7 +292,7 @@ if [ $stage -le 2 ]; then fi -egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress --cut-zero-frames=$cut_zero_frames" +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" [ -z $valid_left_context ] && valid_left_context=$left_context; diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index aad1320e0a0..b5597b15667 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -804,26 +804,5 @@ void GetWeightsForRanges(int32 range_length, } -void GetWeightsForRangesNew(int32 range_length, - int32 num_frames_zeroed, - const std::vector &range_starts, - std::vector > *weights) { - KALDI_ASSERT(range_length > 0 && num_frames_zeroed * 2 < range_length); - int32 num_ranges = range_starts.size(); - weights->resize(num_ranges); - for (int32 i = 0; i < num_ranges; i++) { - (*weights)[i].Resize(range_length); - (*weights)[i].Set(1.0); - } - if (num_frames_zeroed == 0) - return; - for (int32 i = 1; i < num_ranges; i++) - (*weights)[i].Range(0, num_frames_zeroed).Set(0.0); - for (int32 i = 0; i + 1 < num_ranges; i++) - (*weights)[i].Range(range_length - num_frames_zeroed, - num_frames_zeroed).Set(0.0); -} - - } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index 2dda8baf1e4..a94f68ade90 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -402,27 +402,6 @@ void GetWeightsForRanges(int32 range_length, std::vector > *weights); -/// This is a newer version of GetWeightsForRanges with a simpler behavior -/// than GetWeightsForRanges and a different purpose. Instead of aiming to -/// create weights that sum to one over the whole file, the purpose is to -/// zero out the derivative weights for a certain number of frames to each -/// side of every 'cut point' in the numerator lattice [by numerator lattice, -/// what I mean is the FST that we automatically generate from the numerator -/// alignment or lattice]. So we don't zero out the weights for the very -/// beginning or very end of each original utterance, just those where -/// we split the utterance into pieces. We believe there is an incentive -/// for the network to produce deletions near the edges, and this aims to fix -/// this problem. -/// range_length is the length of each range of times (so range_starts[0] -/// represents the start of a range of t values of length 'range_length' -/// and so range_starts[1] etc.), and num_frames_zeroed is the number of frames -/// on each side of the cut point on which we are supposed to zero out the -/// derivative. 
-void GetWeightsForRangesNew(int32 range_length, - int32 num_frames_zeroed, - const std::vector &range_starts, - std::vector > *weights); - typedef TableWriter > SupervisionWriter; typedef SequentialTableReader > SequentialSupervisionReader; diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index cc463d179da..968a50af889 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -49,7 +49,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 frames_per_eg, int32 frames_overlap_per_eg, int32 frame_subsampling_factor, - int32 cut_zero_frames, int64 *num_frames_written, int64 *num_egs_written, NnetChainExampleWriter *example_writer) { @@ -86,7 +85,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, return ProcessFile(normalization_fst, feats_new, ivector_feats, supervision, utt_id, compress, left_context, right_context, frames_per_eg, frames_overlap_per_eg, frame_subsampling_factor, - cut_zero_frames, num_frames_written, num_egs_written, + num_frames_written, num_egs_written, example_writer); } @@ -116,15 +115,10 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, // to the edge are not as accurate as they could be, because when we split we // don't know the correct alphas and betas). std::vector > deriv_weights; - if (cut_zero_frames >= 0) - chain::GetWeightsForRangesNew(frames_per_eg_subsampled, - cut_zero_frames / frame_subsampling_factor, - range_starts_subsampled, - &deriv_weights); - else - chain::GetWeightsForRanges(frames_per_eg_subsampled, - range_starts_subsampled, - &deriv_weights); + + chain::GetWeightsForRanges(frames_per_eg_subsampled, + range_starts_subsampled, + &deriv_weights); if (range_starts_subsampled.empty()) { KALDI_WARN << "No output for utterance " << utt_id @@ -250,10 +244,6 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs in " "compressed format (recommended)"); - po.Register("cut-zero-frames", &cut_zero_frames, "Number of frames " - "(measured before subsampling) to zero the derivative on each " - "side of a cut point (if set, activates new-style derivative " - "weights)"); po.Register("left-context", &left_context, "Number of frames of left " "context the neural net requires."); po.Register("right-context", &right_context, "Number of frames of right " @@ -276,7 +266,7 @@ int main(int argc, char *argv[]) { "frame-rate of the input"); po.Read(argc, argv); - + srand(srand_seed); if (po.NumArgs() < 3 || po.NumArgs() > 4) { @@ -355,7 +345,7 @@ int main(int argc, char *argv[]) { key, compress, left_context, right_context, num_frames, num_frames_overlap, frame_subsampling_factor, - cut_zero_frames, &num_frames_written, &num_egs_written, + &num_frames_written, &num_egs_written, &example_writer)) num_done++; else From 1e215a842a43807d900aa35f86e1daf8ae5d1f77 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 16 Dec 2016 17:59:29 -0500 Subject: [PATCH 028/213] Some draft code, on the way to changing egs-extraction code to allow different-sized egs, and different begin/end l/r context --- src/chainbin/nnet3-chain-get-egs.cc | 4 +- src/nnet3/nnet-example-utils.h | 93 ++++++++++++++++++++++++++++- src/nnet3bin/nnet3-get-egs.cc | 32 +++++----- 3 files changed, 109 insertions(+), 20 deletions(-) diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 968a50af889..6f77a3c208b 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ 
b/src/chainbin/nnet3-chain-get-egs.cc @@ -252,9 +252,7 @@ int main(int argc, char *argv[]) { "that each example contains. Will be rounded up to a multiple " "of --frame-subsampling-factor."); po.Register("num-frames-overlap", &num_frames_overlap, "Number of frames of " - "overlap between each example (could be useful in conjunction " - "--min-deriv-time and --max-deriv-time, to avoid wasting data). " - "Each time we shift by --num-frames minus --num-frames-overlap."); + "overlap between each example."); po.Register("ivectors", &ivector_rspecifier, "Rspecifier of ivector " "features, as a matrix."); po.Register("srand", &srand_seed, "Seed for random number generator " diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 6ebffcf1d50..c0f76f3bf21 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -71,8 +71,94 @@ void WriteVectorAsChar(std::ostream &os, // Reads data written by WriteVectorAsChar. void ReadVectorAsChar(std::istream &is, - bool binary, - Vector *vec); + bool binary, + Vector *vec); + + +// Warning: after reading in the values from the command line +// (Register() and then then po.Read()), you should then call ComputeDerived() +// to set up the 'derived values' (parses 'num_frames_str'). +struct ExampleExtractionConfig { + int32 left_context; + int32 right_context; + int32 left_context_initial; + int32 right_context_final; + int32 num_frames_overlap; + std::string num_frames_str; + + + // The following parameters are derived parameters, computed by + // ComputeDerived(). + int32 num_frames; // the 'principal' number of frames + std::vector num_frames_alternative; + + ExampleExtractionConfig(): + left_context(0), right_context(0), + left_context_initial(-1), right_context_initial(-1), + num_frames_overlap(0), + num_frames_str("1"), num_frames(-1) { } + + /// This function decodes 'num_frames_str' into 'num_frames' and 'num_frames_alternatives', + /// and ensures that 'num_frames', and the members of num_frames_alternatives' are + /// multiples of 'frame_subsampling_factor'. + /// + void ComputeDerived(); + + void Register(OptionsItf *po) { + po->Register("left-context", &left_context, "Number of frames of left " + "context of input features that are added to each " + "example"); + po->Register("right-context", &right_context, "Number of frames of right " + "context of input features that are added to each " + "example"); + po->Register("left-context-initial", &left_context, "Number of frames " + "of left context of input features that are added to each " + "example at the start of the utterance (if <0, this " + "defaults to the same as --left-context)"); + po->Register("right-context-final", &right_context, "Number of frames " + "of right context of input features that are added to each " + "example at the end of the utterance (if <0, this " + "defaults to the same as --right-context)"); + po->Register("right-context", &right_context, "Number of frames of right " + "context of input features that are added to each " + "example"); + po->Register("num-frames", &num_frames_str, "Number of frames with labels " + "that each example contains (i.e. the left and right context " + "are to be added to this). May just be an integer (e.g. " + "--num-frames=8), or an principal value followed by " + "alternative values to be used at most once for each utterance " + "to deal with odd-sized input, e.g. 
--num-frames=40,25,50 means " + "that most of the time the number of frames will be 40, but to " + "deal with odd-sized inputs we may also generate egs with these " + "other sizes. All these values will be rounded up to the " + "closest multiple of --frame-subsampling-factor."); + po.Register("num-frames-overlap", &num_frames_overlap, "Number of frames of " + "overlap between adjacent examples (advisory, will not be " + "exactly enforced)"); + po.Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " + "if the frame-rate of the output labels in the generated " + "examples will be less than the frame-rate at the input"); + } +}; + + + +void ComputeExampleTimeInfo(const ExampleExtractionConfig &config, + int32 num_frames_in_utt, + + SplitIntoRanges(int32 num_frames, + int32 frames_per_range, + std::vector *range_starts); + + + +struct ExampleTimeInfo { + int32 first_frame; + int32 num_frames; + int32 left_context; + int32 right_context; +}; + // This function rounds up the quantities 'num_frames' and 'num_frames_overlap' // to the nearest multiple of the frame_subsampling_factor @@ -81,6 +167,9 @@ void RoundUpNumFrames(int32 frame_subsampling_factor, int32 *num_frames_overlap); + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc index 75f264f1ceb..897ffad7b48 100644 --- a/src/nnet3bin/nnet3-get-egs.cc +++ b/src/nnet3bin/nnet3-get-egs.cc @@ -43,7 +43,7 @@ static void ProcessFile(const MatrixBase &feats, int64 *num_egs_written, NnetExampleWriter *example_writer) { KALDI_ASSERT(feats.NumRows() == static_cast(pdf_post.size())); - + for (int32 t = 0; t < feats.NumRows(); t += frames_per_eg) { // actual_frames_per_eg is the number of frames with nonzero @@ -57,7 +57,7 @@ static void ProcessFile(const MatrixBase &feats, int32 tot_frames = left_context + frames_per_eg + right_context; Matrix input_frames(tot_frames, feats.NumCols(), kUndefined); - + // Set up "input_frames". for (int32 j = -left_context; j < frames_per_eg + right_context; j++) { int32 t2 = j + t; @@ -69,7 +69,7 @@ static void ProcessFile(const MatrixBase &feats, } NnetExample eg; - + // call the regular input "input". eg.io.push_back(NnetIo("input", - left_context, input_frames)); @@ -93,10 +93,10 @@ static void ProcessFile(const MatrixBase &feats, labels[i] = pdf_post[t + i]; // remaining posteriors for frames are empty. 
eg.io.push_back(NnetIo("output", num_pdfs, 0, labels)); - + if (compress) eg.Compress(); - + std::ostringstream os; os << utt_id << "-" << t; @@ -137,30 +137,32 @@ int main(int argc, char *argv[]) { "nnet3-get-egs --num-pdfs=2658 --left-context=12 --right-context=9 --num-frames=8 \"$feats\"\\\n" "\"ark:gunzip -c exp/nnet/ali.1.gz | ali-to-pdf exp/nnet/1.nnet ark:- ark:- | ali-to-post ark:- ark:- |\" \\\n" " ark:- \n"; - + bool compress = true; int32 num_pdfs = -1, left_context = 0, right_context = 0, num_frames = 1, length_tolerance = 100; - + std::string ivector_rspecifier; - + ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs in " "compressed format."); po.Register("num-pdfs", &num_pdfs, "Number of pdfs in the acoustic " "model"); po.Register("left-context", &left_context, "Number of frames of left " - "context the neural net requires."); + "context of input features that are added to each " + "example"); po.Register("right-context", &right_context, "Number of frames of right " - "context the neural net requires."); + "context of input features that are added to each " + "example"); po.Register("num-frames", &num_frames, "Number of frames with labels " "that each example contains."); po.Register("ivectors", &ivector_rspecifier, "Rspecifier of ivector " "features, as a matrix."); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - + po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -170,7 +172,7 @@ int main(int argc, char *argv[]) { if (num_pdfs <= 0) KALDI_ERR << "--num-pdfs options is required."; - + std::string feature_rspecifier = po.GetArg(1), pdf_post_rspecifier = po.GetArg(2), @@ -181,10 +183,10 @@ int main(int argc, char *argv[]) { RandomAccessPosteriorReader pdf_post_reader(pdf_post_rspecifier); NnetExampleWriter example_writer(examples_wspecifier); RandomAccessBaseFloatMatrixReader ivector_reader(ivector_rspecifier); - + int32 num_done = 0, num_err = 0; int64 num_frames_written = 0, num_egs_written = 0; - + for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); const Matrix &feats = feat_reader.Value(); @@ -221,7 +223,7 @@ int main(int argc, char *argv[]) { num_err++; continue; } - + ProcessFile(feats, ivector_feats, pdf_post, key, compress, num_pdfs, left_context, right_context, num_frames, &num_frames_written, &num_egs_written, From 01295659c3f7153a136a7b51307a8f714e19ab30 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Mon, 19 Dec 2016 12:02:41 -0800 Subject: [PATCH 029/213] Draft of UtteranceSplitter and related code --- src/nnet3/nnet-example-utils.cc | 345 ++++++++++++++++++++++++++++++++ src/nnet3/nnet-example-utils.h | 136 +++++++++++-- 2 files changed, 466 insertions(+), 15 deletions(-) diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 30f7840f6f8..547c70578ab 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -286,5 +286,350 @@ void RoundUpNumFrames(int32 frame_subsampling_factor, } +/* + This comment describes the idea behind what InitChunkSize() is supposed to do, + and how it relates to the purpose of class UtteranceSplitter. + + Class UtteranceSplitter is supposed to tell us, for a given utterance length, + what chunk sizes to use. The chunk sizes it may choose are: + - zero or more chunks of the 'principal' size (the first-listed value in + num-frames) + - at most two chunks of 'alternative' num-frames (any but the first-listed + num-frames). 
+
+  (and an empty list of chunks is not allowed as a split).  A split is
+  effectively a multiset of chunk-sizes (the order will be randomized by the
+  caller).  We represent it in code as a list of chunk-sizes, represented as a
+  std::vector, which is sorted to get a unique representation without repeats of
+  different orderings.
+
+  The choice of split is determined by a cost-function that depends on the sum
+  of the chunk-sizes in the split and the length of the utterance: the idea is
+  that we want the sum of chunk-sizes in the split to be as close as possible to
+  the utterance length.  The cost-function penalizes the sum of chunk-sizes
+  being smaller than the utterance-length (leading to gaps) twice as much as
+  when the sum of chunk-sizes is larger than the utterance length.  I.e.
+    cost(chunk_size_sum, utt_length) = (chunk_size_sum > utt_length ?
+                                        chunk_size_sum - utt_length :
+                                        2 * (utt_length - chunk_size_sum))
+  [but as a special case, set the cost to infinity if the largest chunk size in
+  the split is longer than the utterance length; we couldn't, in that case, use
+  this split for this utterance].
+
+  We want to make sure a good variety of combinations of chunk sizes are chosen
+  in case there are ties from the cost function.  For each utterance length
+  we store the set of splits whose costs are within 2
+  of the best cost available for that utterance length.  When asked to find
+  chunks for a particular utterance of that length, we will choose randomly
+  from that pool of splits.
+ */
+void UtteranceSplitter::InitChunkSize() {
+  int32 max_utterance_length = MaxUtteranceLength();
+
+  // The 'splits' vector is a list of possible splits (a split being
+  // a multiset of chunk-sizes, represented as a sorted vector).
+  // The vector 'splits' is itself sorted.
+  std::vector<std::vector<int32> > splits;
+  InitSplits(&splits);
+
+  // Define a split-index 0 <= s < splits.size() as index into the 'splits'
+  // vector, and let a cost c >= 0 represent the mismatch between an
+  // utterance length and the total length of the chunk sizes in a split:
+  //  c(chunk_size_sum, utt_length) = (chunk_size_sum > utt_length ?
+  //                                   chunk_size_sum - utt_length :
+  //                                   2 * (utt_length - chunk_size_sum))
+  // [but as a special case, set c to infinity if the largest chunk size in the
+  // split is longer than the utterance length; we couldn't, in that case, use
+  // this split for this utterance].
+
+  // 'costs_for_length[u][s]', indexed by utterance-length u and then split,
+  // contains the cost for utterance-length u and split s.
+  std::vector<std::vector<int32> > costs_for_length(
+      max_utterance_length + 1);
+  int32 num_splits = splits.size();
+
+  for (int32 u = 0; u <= max_utterance_length; u++)
+    costs_for_length[u].reserve(num_splits);
+
+  for (int32 s = 0; s < num_splits; s++) {
+    const std::vector<int32> &split = splits[s];
+    int32 chunk_size_sum = std::accumulate(split.begin(), split.end(),
+                                           int32(0)),
+        max_chunk_size = *std::max_element(split.begin(), split.end());
+    for (int32 u = 0; u <= max_utterance_length; u++) {
+      // c is the cost for this utterance length and this split.  We penalize
+      // gaps twice as strongly as overlaps, based on the intuition that
+      // completely throwing out frames of data is worse than counting them
+      // twice.  It might be possible to come up with some kind of mathematical
+      // justification for this based on variance of the estimated gradient.
+      int32 c = (chunk_size_sum > u ?
+                 chunk_size_sum - u :
+                 2 * (u - chunk_size_sum));
+      if (max_chunk_size > u)
+        c = std::numeric_limits<int32>::max();
+      costs_for_length[u].push_back(c);
+    }
+  }
+
+  splits_for_length_.resize(max_utterance_length + 1);
+
+  for (int32 u = 0; u <= max_utterance_length; u++) {
+    const std::vector<int32> &costs = costs_for_length[u];
+    int32 min_cost = *std::min_element(costs.begin(), costs.end());
+    if (min_cost == std::numeric_limits<int32>::max()) {
+      // All costs were infinity, because this utterance-length u is shorter
+      // than the smallest chunk-size.  Leave splits_for_length_[u] as empty
+      // for this utterance-length, meaning we will not be able to choose any
+      // split, and such utterances will be discarded.
+      continue;
+    }
+    int32 cost_threshold = 2;  // We will choose pseudo-randomly from splits
+                               // that are within this distance from the best
+                               // cost.
+    std::vector<int32>::const_iterator iter = costs.begin(), end = costs.end();
+    int32 s = 0;
+    for (; iter != end; ++iter, ++s)
+      if (*iter < min_cost + cost_threshold)
+        splits_for_length_[u].push_back(splits[s]);
+  }
+
+  if (GetVerboseLevel() >= 3) {
+    std::ostringstream os;
+    for (int32 u = 0; u <= max_utterance_length; u++) {
+      if (!splits_for_length_[u].empty()) {
+        os << u << "=(";
+        std::vector<std::vector<int32> >::const_iterator
+            iter1 = splits_for_length_[u].begin(),
+            end1 = splits_for_length_[u].end();
+
+        while (iter1 != end1) {
+          std::vector<int32>::const_iterator iter2 = iter1->begin(),
+              end2 = iter1->end();
+          while (iter2 != end2) {
+            os << *iter2;
+            ++iter2;
+            if (iter2 != end2) os << ",";
+          }
+          ++iter1;
+          if (iter1 != end1) os << "/";
+        }
+        os << ")";
+        if (u < max_utterance_length) os << ", ";
+      }
+    }
+    KALDI_VLOG(3) << "Utterance-length-to-splits map is: " << os.str();
+  }
+}
+
+
+void UtteranceSplitter::GetChunkSizesForUtterance(
+    int32 utterance_length,
+    std::vector<int32> *chunk_sizes) const {
+  KALDI_ASSERT(!splits_for_length_.empty());
+  // 'primary_length' is the first-specified num-frames.
+  // It's the only chunk that may be repeated an arbitrary number
+  // of times.
+  int32 primary_length = config_.num_frames[0],
+      max_tabulated_length = splits_for_length_.size() - 1,
+      num_primary_length_repeats = 0;
+
+  KALDI_ASSERT(utterance_length >= 0);
+  while (utterance_length > max_tabulated_length) {
+    utterance_length -= primary_length;
+    num_primary_length_repeats++;
+  }
+  KALDI_ASSERT(utterance_length >= 0);
+  const std::vector<std::vector<int32> > &possible_splits =
+      splits_for_length_[utterance_length];
+  int32 num_possible_splits = possible_splits.size(),
+      randomly_chosen_split = RandInt(0, num_possible_splits - 1);
+  *chunk_sizes = possible_splits[randomly_chosen_split];
+  for (int32 i = 0; i < num_primary_length_repeats; i++)
+    chunk_sizes->push_back(primary_length);
+  // Randomize the order in which the chunks appear.
+  std::random_shuffle(chunk_sizes->begin(),
+                      chunk_sizes->end());
+}
+
+
+int32 UtteranceSplitter::MaxUtteranceLength() const {
+  int32 num_lengths = config_.num_frames.size();
+  KALDI_ASSERT(num_lengths > 0);
+  // 'primary_length' is the first-specified num-frames.
+  // It's the only chunk that may be repeated an arbitrary number
+  // of times.
+  int32 primary_length = config_.num_frames[0],
+      max_length = primary_length;
+  for (int32 i = 0; i < num_lengths; i++) {
+    KALDI_ASSERT(config_.num_frames[i] > 0);
+    max_length = std::max(config_.num_frames[i], max_length);
+  }
+  return 2 * max_length + primary_length;
+}
+
+void UtteranceSplitter::InitSplits(std::vector<std::vector<int32> > *splits) const {
+  // we consider splits whose total length is up to MaxUtteranceLength() +
+  // primary_length.  We can be confident without doing a lot of math, that
+  // multisets above this length will never be chosen for any utterance-length
+  // up to MaxUtteranceLength().
+  int32 primary_length = config_.num_frames[0],
+      length_ceiling = MaxUtteranceLength() + primary_length;
+
+  typedef std::unordered_set<std::vector<int32>, VectorHasher<int32> > SetType;
+
+  SetType splits_set;
+
+  int32 num_lengths = config_.num_frames.size();
+
+  // The splits we allow are: zero to two 'alternate' lengths, plus
+  // an arbitrary number of repeats of the 'primary' length.  The repeats
+  // of the 'primary' length are handled by the inner loop over n.
+  // The zero to two 'alternate' lengths are handled by the loops over
+  // i and j.  i == 0 and j == 0 are special cases; they mean, no
+  // alternate is chosen.
+  for (int32 i = 0; i < num_lengths; i++) {
+    for (int32 j = 0; j < num_lengths; j++) {
+      std::vector<int32> vec;
+      if (i > 0)
+        vec.push_back(config_.num_frames[i]);
+      if (j > 0)
+        vec.push_back(config_.num_frames[j]);
+      for (int32 n = 0;
+           std::accumulate(vec.begin(), vec.end(), int32(0)) <= length_ceiling;
+           ++n, vec.push_back(primary_length)) {
+        std::sort(vec.begin(), vec.end());  // we don't want to treat different
+                                            // orderings of the same values as
+                                            // different, so sort them.
+        if (!vec.empty())  // Don't allow the empty vector as a split.
+          splits_set.insert(vec);
+      }
+    }
+  }
+  for (SetType::const_iterator iter = splits_set.begin();
+       iter != splits_set.end(); ++iter)
+    splits->push_back(*iter);
+  std::sort(splits->begin(), splits->end());  // make the order deterministic,
+                                              // for consistency of output
+                                              // between runs and C libraries.
+}
+
+
+// static
+void UtteranceSplitter::DistributeRandomly(int32 n, std::vector<int32> *vec) {
+  KALDI_ASSERT(!vec->empty());
+  int32 size = vec->size();
+  if (n < 0) {
+    DistributeRandomly(-n, vec);
+    for (int32 i = 0; i < size; i++)
+      (*vec)[i] *= -1;
+    return;
+  }
+  // from this point we know n >= 0.
+  int32 common_part = n / size,
+      remainder = n % size, i;
+  for (i = 0; i < remainder; i++) {
+    (*vec)[i] = common_part + 1;
+  }
+  for (; i < size; i++) {
+    (*vec)[i] = common_part;
+  }
+  std::random_shuffle(vec->begin(), vec->end());
+  KALDI_ASSERT(std::accumulate(vec->begin(), vec->end(), int32(0)) == n);
+}
+
+
+void UtteranceSplitter::GetGapSizes(int32 utterance_length,
+                                    bool enforce_subsampling_factor,
+                                    const std::vector<int32> &chunk_sizes,
+                                    std::vector<int32> *gap_sizes) const {
+  if (chunk_sizes.empty()) {
+    gap_sizes->clear();
+    return;
+  }
+  if (enforce_subsampling_factor && config_.frame_subsampling_factor > 1) {
+    int32 sf = config_.frame_subsampling_factor, size = chunk_sizes.size();
+    int32 utterance_length_reduced = (utterance_length + (sf - 1)) / sf;
+    std::vector<int32> chunk_sizes_reduced(chunk_sizes);
+    for (int32 i = 0; i < size; i++) {
+      KALDI_ASSERT(chunk_sizes[i] % config_.frame_subsampling_factor == 0);
+      chunk_sizes_reduced[i] /= config_.frame_subsampling_factor;
+    }
+    GetGapSizes(utterance_length_reduced, false,
+                chunk_sizes_reduced, gap_sizes);
+    KALDI_ASSERT(gap_sizes->size() == static_cast<size_t>(size));
+    for (int32 i = 0; i < size; i++)
+      (*gap_sizes)[i] *= config_.frame_subsampling_factor;
+    return;
+  }
+  int32 num_chunks = chunk_sizes.size(),
+      total_of_chunk_sizes = std::accumulate(chunk_sizes.begin(),
+                                             chunk_sizes.end(),
+                                             int32(0)),
+      total_gap = utterance_length - total_of_chunk_sizes;
+  gap_sizes->resize(num_chunks);
+
+  if (total_gap < 0) {
+    // there is an overlap.  Overlaps can only go between chunks, not at the
+    // beginning or end of the utterance.
+    if (num_chunks == 1) {
+      // there needs to be an overlap, but there is only one chunk... this means
+      // the chunk-size exceeds the utterance length, which is not allowed.
+      KALDI_ERR << "Chunk size is " << chunk_sizes[0]
+                << " but utterance length is only "
+                << utterance_length;
+    }
+
+    // note the elements of 'overlaps' will be <= 0.
+    std::vector<int32> overlaps(num_chunks - 1);
+    DistributeRandomly(total_gap, &overlaps);
+    (*gap_sizes)[0] = 0;  // no gap before 1st chunk.
+    for (int32 i = 1; i < num_chunks; i++)
+      (*gap_sizes)[i] = overlaps[i-1];
+  } else {
+    // There may be a gap.  Gaps can go at the start or end of the utterance, or
+    // between segments.
+    std::vector<int32> gaps(num_chunks + 1);
+    DistributeRandomly(total_gap, &gaps);
+    // the last element of 'gaps', the one at the end of the utterance, is
+    // implicit and doesn't have to be written to the output.
+    for (int32 i = 0; i < num_chunks; i++)
+      (*gap_sizes)[i] = gaps[i];
+  }
+}
+
+
+void UtteranceSplitter::GetChunksForUtterance(
+    int32 utterance_length,
+    std::vector<ChunkTimeInfo> *chunk_info) const {
+  std::vector<int32> chunk_sizes;
+  GetChunkSizesForUtterance(utterance_length, &chunk_sizes);
+  std::vector<int32> gaps(chunk_sizes.size());
+  GetGapSizes(utterance_length, true, chunk_sizes, &gaps);
+  int32 num_chunks = chunk_sizes.size();
+  chunk_info->resize(num_chunks);
+  int32 t = 0;
+  for (int32 i = 0; i < num_chunks; i++) {
+    t += gaps[i];
+    ChunkTimeInfo &info = (*chunk_info)[i];
+    info.first_frame = t;
+    info.num_frames = chunk_sizes[i];
+    info.left_context = (i == 0 && config_.left_context_initial >= 0 ?
+                         config_.left_context_initial : config_.left_context);
+    info.right_context = (i == num_chunks - 1 && config_.right_context_final >= 0 ?
+                          config_.right_context_final : config_.right_context);
+    t += chunk_sizes[i];
+  }
+  // check that the end of the last chunk doesn't go more than
+  // 'config_.frame_subsampling_factor - 1' frames past the end
+  // of the utterance.
That amount, we treat as rounding error. + KALDI_ASSERT(t - utterance_length < config_.frame_subsampling_factor); +} + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index c0f76f3bf21..d02aa336a10 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -89,14 +89,17 @@ struct ExampleExtractionConfig { // The following parameters are derived parameters, computed by // ComputeDerived(). - int32 num_frames; // the 'principal' number of frames - std::vector num_frames_alternative; + + // the first element of the 'num_frames' vector is the 'principal' number of + // frames; the remaining elements are alternatives to the principal number of + // frames, to be used at most once or twice per file. + std::vector num_frames; ExampleExtractionConfig(): left_context(0), right_context(0), left_context_initial(-1), right_context_initial(-1), num_frames_overlap(0), - num_frames_str("1"), num_frames(-1) { } + num_frames_str("1") { } /// This function decodes 'num_frames_str' into 'num_frames' and 'num_frames_alternatives', /// and ensures that 'num_frames', and the members of num_frames_alternatives' are @@ -111,13 +114,13 @@ struct ExampleExtractionConfig { po->Register("right-context", &right_context, "Number of frames of right " "context of input features that are added to each " "example"); - po->Register("left-context-initial", &left_context, "Number of frames " - "of left context of input features that are added to each " - "example at the start of the utterance (if <0, this " + po->Register("left-context-initial", &left_context_initial, "Number of " + "frames of left context of input features that are added to " + "each example at the start of the utterance (if <0, this " "defaults to the same as --left-context)"); - po->Register("right-context-final", &right_context, "Number of frames " - "of right context of input features that are added to each " - "example at the end of the utterance (if <0, this " + po->Register("right-context-final", &right_context_final, "Number of " + "frames of right context of input features that are added " + "to each example at the end of the utterance (if <0, this " "defaults to the same as --right-context)"); po->Register("right-context", &right_context, "Number of frames of right " "context of input features that are added to each " @@ -143,6 +146,115 @@ struct ExampleExtractionConfig { +/** + struct ChunkTimeInfo is used by class Utterane + */ + +struct ChunkTimeInfo { + int32 first_frame; + int32 num_frames; + int32 left_context; + int32 right_context; +}; + + +class UtteranceSplitter { + + UtteranceSplitter(const ExampleExtractionConfig &config); + + + // Given an utterance length, this function creates for you a set of + // chunks into which to split the utterance. Note: this is partly + // random (will call srand()). + void GetChunksForUtterance(int32 utterance_length, + std::vector *chunk_info) const; + + + private: + + + void InitSplitForLength(); + + + // Used in InitSplitForLength(), returns the maximum utterance-length considered + // separately in split_for_length_. [above this, we'll assume that the additional + // length is consumed by multiples of the 'principal' chunk size.] It returns + // the primary chunk-size (config_.num_frames[0]) plus twice the largest of + // any of the allowed chunk sizes (i.e. 
the max of config_.num_frames) + int32 MaxUtteranceLength() const; + + // Used in InitSplitForLength(), this function outputs the set of allowed + // splits, represented as a sorted list of nonempty vectors (each split is a + // sorted list of chunk-sizes). + void InitSplits(std::vector > *splits) const; + + + // Used in GetChunksForUtterance, this function selects the list of + // chunk-sizes for that utterance (later on, the positions and and left/right + // context information for the chunks will be added to this). We don't call + // this a 'split', although it's also a list of chunk-sizes, because we + // randomize the order in which the chunk sizes appear, whereas for a 'split' + // we sort the chunk-sizes because a 'split' is conceptually an + // order-independent representation. + void GetChunkSizesForUtterance(int32 utterance_length, + std::vector *chunk_sizes) const; + + + // Used in GetChunksForUtterance, this function selects the 'gap sizes' + // before each of the chunks. These 'gap sizes' may be positive (representing + // a gap between chunks, or a number of frames at the beginning of the file that + // don't correspond to a chunk), or may be negative, corresponding to overlaps + // between adjacent chunks. + // + // If config_.frame_subsampling_factor > 1 and enforce_subsampling_factor is + // true, this function will ensure that all elements of 'gap_sizes' are + // multiples of config_.frame_subsampling_factor. (we always enforce this, + // but we set it to false inside a recursion when we recurse). Note: if + // config_.frame_subsampling_factor > 1, it's possible for the last chunk to + // go over 'utterance_length' by up to config_.frame_subsampling_factor - 1 + // frames (i.e. it would require that many frames past the utterance end). + // This will be dealt with when generating egs, by duplicating the last frame. + void GetGapSizes(int32 utterance_length, + bool enforce_subsampling_factor, + const std::vector &chunk_sizes, + std::vector *gap_sizes) const; + + + // this static function, used in GetGapSizes(), writes values to + // a vector 'vec' such the sum of those values equals n. It + // tries to make those values as similar as possible (they will + // differ by at most one), and the location of the larger versus + // smaller values is random. n may be negative. 'vec' must be + // nonempty. + static void DistributeRandomly(int32 n, + std::vector *vec); + + + const ExampleExtractionConfig &config_; + + // The vector 'split_for_length_' is indexed by the num-frames of a file, and + // gives us a list of alternative splits that we can use if the utternace has + // that many frames. For example, if split_for_length[100] = ( (25, 40, 40), + // (40, 65) ), it means we could either split as chunks of size (25, 40, 40) + // or as (40, 65). (we'll later randomize the order). should use one chunk + // of size 25 and two chunks of size 40. In general these won't add up to + // exactly the length of the utterance; we'll have them overlap (or have small + // gaps between them) to account for this, and the details of this will be + // randomly decided per file. If splits_for_length_[u] is empty, it means the + // utterance was shorter than the smallest possible chunk size, so + // we will have to discard the utterance. 
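Given the table described above, the per-utterance lookup reduces to indexing by length and sampling one split from the stored pool. A minimal sketch of that step, with hypothetical ChooseSplit/table names (int32 and RandInt() are assumed from Kaldi's base utilities):

// Returns false if no split is available: either the utterance is shorter
// than the smallest chunk size (empty pool, so it gets discarded), or it is
// longer than the table covers, which the caller handles separately by first
// peeling off chunks of the primary size.
typedef std::vector<std::vector<std::vector<int32> > > SplitTable;

bool ChooseSplit(const SplitTable &table, int32 utt_length,
                 std::vector<int32> *split) {
  if (utt_length < 0 || utt_length >= static_cast<int32>(table.size()))
    return false;
  const std::vector<std::vector<int32> > &pool = table[utt_length];
  if (pool.empty())
    return false;
  *split = pool[RandInt(0, static_cast<int32>(pool.size()) - 1)];
  return true;
}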
+ + // If an utterance's num-frames is >= split_for_length.size(), the way to find + // the split to use is to keep subtracting the primary num-frames (== + // config_.num_frames[0]) from the utterance length until the resulting + // num-frames is < split_for_length_.size(), chunks, and then add the subtracted + // number of copies of the primary num-frames. + std::vector > > splits_for_length_; + + +}; + + void ComputeExampleTimeInfo(const ExampleExtractionConfig &config, int32 num_frames_in_utt, @@ -152,12 +264,6 @@ void ComputeExampleTimeInfo(const ExampleExtractionConfig &config, -struct ExampleTimeInfo { - int32 first_frame; - int32 num_frames; - int32 left_context; - int32 right_context; -}; // This function rounds up the quantities 'num_frames' and 'num_frames_overlap' From 4ae5e5306c1dd934bc8e32a607261d1c66be338b Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Fri, 23 Dec 2016 20:33:10 -0800 Subject: [PATCH 030/213] Refactoring the example-extraction for nnet3, for more flexibility in the num-frames for examples. This code compiles but is not tested. --- src/chainbin/nnet3-chain-get-egs.cc | 238 +++++-------- src/nnet3/nnet-chain-example.cc | 2 +- src/nnet3/nnet-chain-example.h | 2 +- src/nnet3/nnet-discriminative-example.cc | 4 +- src/nnet3/nnet-discriminative-example.h | 28 +- src/nnet3/nnet-example-utils.cc | 344 ++++++++++++++----- src/nnet3/nnet-example-utils.h | 121 ++++--- src/nnet3bin/nnet3-discriminative-get-egs.cc | 272 ++++++--------- src/nnet3bin/nnet3-get-egs.cc | 167 +++++---- 9 files changed, 655 insertions(+), 523 deletions(-) diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 6f77a3c208b..2a8f5a1c6ad 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -41,149 +41,107 @@ namespace nnet3 { static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const MatrixBase &feats, const MatrixBase *ivector_feats, + int32 ivector_period, const chain::Supervision &supervision, const std::string &utt_id, bool compress, - int32 left_context, - int32 right_context, - int32 frames_per_eg, - int32 frames_overlap_per_eg, - int32 frame_subsampling_factor, + const UtteranceSplitter &utt_splitter, int64 *num_frames_written, int64 *num_egs_written, NnetChainExampleWriter *example_writer) { + bool ans = true; KALDI_ASSERT(supervision.num_sequences == 1); - int32 num_feature_frames = feats.NumRows(), - num_output_frames = supervision.frames_per_sequence, - num_feature_frames_subsampled = - (num_feature_frames + frame_subsampling_factor - 1)/ - frame_subsampling_factor; - if (num_output_frames != num_feature_frames_subsampled) { - // we tolerate deviations in the num-frames if they are very small (1 output - // frame). - - if (abs(num_output_frames - num_feature_frames_subsampled) > 1) { - KALDI_ERR << "Mismatch in num-frames: chain supervision has " - << num_output_frames - << " versus features/frame_subsampling_factor = " - << num_feature_frames << " / " << frame_subsampling_factor - << " = " << num_feature_frames_subsampled - << ": check that --frame-subsampling-factor option is set " - << "the same as to chain-get-supervision."; - } - int32 new_num_feature_frames = - num_output_frames * frame_subsampling_factor; - // add a few frames at the end to make it match up. 
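A worked illustration of the tolerance just above (this is the behaviour of the code being replaced in this hunk), using assumed sizes: 1000 input frames at --frame-subsampling-factor=3 give ceil(1000 / 3) = 334 subsampled frames. If the chain supervision reports 333 frames, the one-frame deviation is accepted and the features are reshaped to 333 * 3 = 999 rows; had it reported 335 frames, the features would be grown to 1005 rows by repeating the final input frame. The arithmetic involved is just:

int32 sf = 3, num_feature_frames = 1000;                 // assumed values
int32 num_feature_frames_subsampled =
    (num_feature_frames + sf - 1) / sf;                  // = 334 (round up)
int32 num_output_frames = 333;                           // from the supervision
int32 new_num_feature_frames = num_output_frames * sf;   // = 999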
- Matrix feats_new(new_num_feature_frames, feats.NumCols(), - kUndefined); - int32 min_feature_frames = std::min(num_feature_frames, - new_num_feature_frames); - feats_new.RowRange(0, min_feature_frames).CopyFromMat( - feats.RowRange(0, min_feature_frames)); - for (int32 i = num_feature_frames; i < new_num_feature_frames; i++) - feats_new.Row(i).CopyFromVec(feats.Row(num_feature_frames - 1)); - return ProcessFile(normalization_fst, feats_new, ivector_feats, - supervision, utt_id, compress, left_context, right_context, - frames_per_eg, frames_overlap_per_eg, frame_subsampling_factor, - num_frames_written, num_egs_written, - example_writer); - } + int32 num_input_frames = feats.NumRows(), + num_output_frames = supervision.frames_per_sequence; - KALDI_ASSERT(frames_per_eg % frame_subsampling_factor == 0); + if (!utt_splitter.LengthsMatch(utt_id, num_input_frames, num_output_frames)) + return false; // LengthsMatch() will have printed a warning. - int32 frames_per_eg_subsampled = frames_per_eg / frame_subsampling_factor, - frames_overlap_subsampled = frames_overlap_per_eg / frame_subsampling_factor, - frames_shift_subsampled = frames_per_eg_subsampled - frames_overlap_subsampled; + std::vector chunks; - if (num_feature_frames_subsampled < frames_per_eg_subsampled) { - KALDI_WARN << "Length of features for utterance " << utt_id - << " is less than than the frames_per_eg (after sub-sampling)."; + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_input_frames << " frames."; return false; } - // we don't do any padding, as it would be a bit tricky to pad the 'chain' supervision. - // Instead we select ranges of frames that fully fit within the file; these - // might slightly overlap with each other or have gaps. - std::vector range_starts_subsampled; - chain::SplitIntoRanges(num_feature_frames_subsampled - - frames_overlap_subsampled, - frames_shift_subsampled, - &range_starts_subsampled); - // The 'deriv_weights' make sure we don't count frames twice, and also ensure - // that we tend to avoid having nonzero weights on the derivatives that are - // too close to the edge of the corresponding 'range' (these derivatives close - // to the edge are not as accurate as they could be, because when we split we - // don't know the correct alphas and betas). 
- std::vector > deriv_weights; - - chain::GetWeightsForRanges(frames_per_eg_subsampled, - range_starts_subsampled, - &deriv_weights); - - if (range_starts_subsampled.empty()) { - KALDI_WARN << "No output for utterance " << utt_id - << " (num-frames=" << num_feature_frames - << ") because too short for --frames-per-eg=" - << frames_per_eg; - return false; - } + int32 frame_subsampling_factor = utt_splitter.Config().frame_subsampling_factor; + + utt_splitter.GetChunksForUtterance(num_input_frames, &chunks); + chain::SupervisionSplitter splitter(supervision); - for (size_t i = 0; i < range_starts_subsampled.size(); i++) { - int32 range_start_subsampled = range_starts_subsampled[i], - range_start = range_start_subsampled * frame_subsampling_factor; + for (size_t c = 0; c < chunks.size(); c++) { + ChunkTimeInfo &chunk = chunks[c]; + + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; chain::Supervision supervision_part; - splitter.GetFrameRange(range_start_subsampled, - frames_per_eg_subsampled, + splitter.GetFrameRange(start_frame_subsampled, + num_frames_subsampled, &supervision_part); if (normalization_fst.NumStates() > 0 && !AddWeightToSupervisionFst(normalization_fst, &supervision_part)) { - KALDI_WARN << "For utterance " << utt_id << ", frames " - << range_start << " to " << (range_start + frames_per_eg) + KALDI_WARN << "For utterance " << utt_id << ", feature frames " + << chunk.first_frame << " to " + << (chunk.first_frame + chunk.num_frames) << ", FST was empty after composing with normalization FST. " << "This should be extremely rare (a few per corpus, at most)"; - return false; + ans = false; } int32 first_frame = 0; // we shift the time-indexes of all these parts so // that the supervised part starts from frame 0. + + SubVector output_weights( + &(chunk.output_weights[0]), + static_cast(chunk.output_weights.size())); + NnetChainSupervision nnet_supervision("output", supervision_part, - deriv_weights[i], - first_frame, frame_subsampling_factor); + output_weights, + first_frame, + frame_subsampling_factor); NnetChainExample nnet_chain_eg; nnet_chain_eg.outputs.resize(1); nnet_chain_eg.outputs[0].Swap(&nnet_supervision); nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 2 : 1); - int32 tot_frames = left_context + frames_per_eg + right_context; - Matrix input_frames(tot_frames, feats.NumCols(), kUndefined); + int32 tot_input_frames = chunk.left_context + chunk.num_frames + + chunk.right_context; - // Set up "input_frames". - for (int32 j = -left_context; j < frames_per_eg + right_context; j++) { - int32 t = range_start + j; - if (t < 0) t = 0; - if (t >= feats.NumRows()) t = feats.NumRows() - 1; - SubVector src(feats, t), - dest(input_frames, j + left_context); + Matrix input_frames(tot_input_frames, feats.NumCols(), + kUndefined); + + int32 start_frame = chunk.first_frame - chunk.left_context; + for (int32 t = start_frame; t < start_frame + tot_input_frames; t++) { + int32 t2 = t; + if (t2 < 0) t2 = 0; + if (t2 >= num_input_frames) t2 = num_input_frames - 1; + int32 j = t - start_frame; + SubVector src(feats, t2), + dest(input_frames, j); dest.CopyFromVec(src); } - NnetIo input_io("input", - left_context, - input_frames); + NnetIo input_io("input", -chunk.left_context, input_frames); nnet_chain_eg.inputs[0].Swap(&input_io); if (ivector_feats != NULL) { // if applicable, add the iVector feature. 
// choose iVector from a random frame in the chunk - int32 ivector_frame = RandInt(range_start, range_start + frames_per_eg - 1); - KALDI_ASSERT(ivector_feats->NumRows() > 0); - if (ivector_frame >= ivector_feats->NumRows()) - ivector_frame = ivector_feats->NumRows() - 1; + int32 ivector_frame = RandInt(start_frame, + start_frame + num_input_frames - 1), + ivector_frame_subsampled = ivector_frame / ivector_period; + if (ivector_frame_subsampled < 0) + ivector_frame_subsampled = 0; + if (ivector_frame_subsampled >= ivector_feats->NumRows()) + ivector_frame_subsampled = ivector_feats->NumRows() - 1; Matrix ivector(1, ivector_feats->NumCols()); - ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame)); + ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled)); NnetIo ivector_io("ivector", 0, ivector); nnet_chain_eg.inputs[1].Swap(&ivector_io); } @@ -192,16 +150,16 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, nnet_chain_eg.Compress(); std::ostringstream os; - os << utt_id << "-" << range_start; + os << utt_id << "-" << chunk.first_frame; std::string key = os.str(); // key is - - *num_frames_written += frames_per_eg; + *num_frames_written += chunk.num_frames; *num_egs_written += 1; example_writer->Write(key, nnet_chain_eg); } - return true; + return ans; } } // namespace nnet2 @@ -233,35 +191,28 @@ int main(int argc, char *argv[]) { "chain-get-supervision.\n"; bool compress = true; - int32 left_context = 0, right_context = 0, num_frames = 1, - num_frames_overlap = 0, length_tolerance = 100, - cut_zero_frames = -1, - frame_subsampling_factor = 1; + int32 length_tolerance = 100, online_ivector_period = 1; + + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. int32 srand_seed = 0; - std::string ivector_rspecifier; + std::string online_ivector_rspecifier; ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs in " - "compressed format (recommended)"); - po.Register("left-context", &left_context, "Number of frames of left " - "context the neural net requires."); - po.Register("right-context", &right_context, "Number of frames of right " - "context the neural net requires."); - po.Register("num-frames", &num_frames, "Number of frames with labels " - "that each example contains. 
Will be rounded up to a multiple " - "of --frame-subsampling-factor."); - po.Register("num-frames-overlap", &num_frames_overlap, "Number of frames of " - "overlap between each example."); - po.Register("ivectors", &ivector_rspecifier, "Rspecifier of ivector " - "features, as a matrix."); - po.Register("srand", &srand_seed, "Seed for random number generator " - "(only relevant if --pick-random-ivector=true)"); + "compressed format."); + po.Register("ivectors", &online_ivector_rspecifier, "Alias for " + "--online-ivectors option, for back compatibility"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " + "ivector features, as a matrix."); + po.Register("online-ivector-period", &online_ivector_period, "Number of " + "frames between iVectors in matrices supplied to the " + "--online-ivectors option"); + po.Register("srand", &srand_seed, "Seed for random number generator "); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - po.Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " - "if the frame-rate at the output will be less than the " - "frame-rate of the input"); + eg_config.Register(&po); po.Read(argc, argv); @@ -272,12 +223,6 @@ int main(int argc, char *argv[]) { exit(1); } - if (num_frames <= 0 || left_context < 0 || right_context < 0 || - length_tolerance < 0 || frame_subsampling_factor <= 0) - KALDI_ERR << "One of the integer options is out of the allowed range."; - RoundUpNumFrames(frame_subsampling_factor, - &num_frames, &num_frames_overlap); - std::string normalization_fst_rxfilename, feature_rspecifier, @@ -295,6 +240,9 @@ int main(int argc, char *argv[]) { examples_wspecifier = po.GetArg(4); } + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); + fst::StdVectorFst normalization_fst; if (!normalization_fst_rxfilename.empty()) { ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); @@ -305,7 +253,8 @@ int main(int argc, char *argv[]) { chain::RandomAccessSupervisionReader supervision_reader( supervision_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); - RandomAccessBaseFloatMatrixReader ivector_reader(ivector_rspecifier); + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); int32 num_done = 0, num_err = 0; int64 num_frames_written = 0, num_egs_written = 0; @@ -318,31 +267,32 @@ int main(int argc, char *argv[]) { num_err++; } else { const chain::Supervision &supervision = supervision_reader.Value(key); - const Matrix *ivector_feats = NULL; - if (!ivector_rspecifier.empty()) { - if (!ivector_reader.HasKey(key)) { + const Matrix *online_ivector_feats = NULL; + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(key)) { KALDI_WARN << "No iVectors for utterance " << key; num_err++; continue; } else { // this address will be valid until we call HasKey() or Value() // again. 
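The lifetime caveat in this comment is easy to trip over: the reference returned by Value() is only guaranteed to stay valid until the next HasKey() or Value() call on the same reader. A small usage sketch of that pattern (rspecifier and utt are assumed to be defined elsewhere):

RandomAccessBaseFloatMatrixReader reader(rspecifier);
if (reader.HasKey(utt)) {
  const Matrix<BaseFloat> &feats = reader.Value(utt);  // valid for now...
  Matrix<BaseFloat> feats_copy(feats);  // ...take a copy if it must survive a
                                        // later HasKey()/Value() call.
}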
- ivector_feats = &(ivector_reader.Value(key)); + online_ivector_feats = &(online_ivector_reader.Value(key)); } } - if (ivector_feats != NULL && - (abs(feats.NumRows() - ivector_feats->NumRows()) > length_tolerance - || ivector_feats->NumRows() == 0)) { + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - (online_ivector_feats->NumRows() * + online_ivector_period)) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { KALDI_WARN << "Length difference between feats " << feats.NumRows() - << " and iVectors " << ivector_feats->NumRows() - << " exceeds tolerance " << length_tolerance; + << " and iVectors " << online_ivector_feats->NumRows() + << "exceeds tolerance " << length_tolerance; num_err++; continue; } - if (ProcessFile(normalization_fst, feats, ivector_feats, supervision, - key, compress, - left_context, right_context, num_frames, - num_frames_overlap, frame_subsampling_factor, + + if (ProcessFile(normalization_fst, feats, + online_ivector_feats, online_ivector_period, + supervision, key, compress, utt_splitter, &num_frames_written, &num_egs_written, &example_writer)) num_done++; diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 74e8be80240..0607543b743 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -105,7 +105,7 @@ void NnetChainSupervision::Swap(NnetChainSupervision *other) { NnetChainSupervision::NnetChainSupervision( const std::string &name, const chain::Supervision &supervision, - const Vector &deriv_weights, + const VectorBase &deriv_weights, int32 first_frame, int32 frame_skip): name(name), diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 323e73da8da..9be298074a4 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -83,7 +83,7 @@ struct NnetChainSupervision { /// is slower than the input, so in this case it might be 2 or 3. NnetChainSupervision(const std::string &name, const chain::Supervision &supervision, - const Vector &deriv_weights, + const VectorBase &deriv_weights, int32 first_frame, int32 frame_skip); diff --git a/src/nnet3/nnet-discriminative-example.cc b/src/nnet3/nnet-discriminative-example.cc index e9a063e268e..5c02998cbcf 100644 --- a/src/nnet3/nnet-discriminative-example.cc +++ b/src/nnet3/nnet-discriminative-example.cc @@ -91,7 +91,7 @@ NnetDiscriminativeSupervision::NnetDiscriminativeSupervision(const NnetDiscrimin NnetDiscriminativeSupervision::NnetDiscriminativeSupervision( const std::string &name, const discriminative::DiscriminativeSupervision &supervision, - const Vector &deriv_weights, + const VectorBase &deriv_weights, int32 first_frame, int32 frame_skip): name(name), @@ -347,7 +347,7 @@ void GetDiscriminativeComputationRequest(const Nnet &nnet, io_spec.name = name; io_spec.indexes = sup.indexes; io_spec.has_deriv = need_model_derivative; - + if (use_xent_regularization) { size_t cur_size = request->outputs.size(); request->outputs.resize(cur_size + 1); diff --git a/src/nnet3/nnet-discriminative-example.h b/src/nnet3/nnet-discriminative-example.h index b2458b0cdcd..bb60f216a82 100644 --- a/src/nnet3/nnet-discriminative-example.h +++ b/src/nnet3/nnet-discriminative-example.h @@ -42,7 +42,7 @@ struct NnetDiscriminativeSupervision { // the name of the output in the neural net; in simple setups it // will just be "output". std::string name; - + // The indexes that the output corresponds to. The size of this vector will // be equal to supervision.num_sequences * supervision.frames_per_sequence. 
// Be careful about the order of these indexes-- it is a little confusing. @@ -52,7 +52,7 @@ struct NnetDiscriminativeSupervision { // This is done to make the code similar that for the 'chain' model. std::vector indexes; - // The supervision object, containing the numerator and denominator + // The supervision object, containing the numerator and denominator // lattices. discriminative::DiscriminativeSupervision supervision; @@ -68,19 +68,19 @@ struct NnetDiscriminativeSupervision { // so it's equivalent to a vector of all ones. This vector is written // to disk compactly as unsigned char. Vector deriv_weights; - + // Use default assignment operator NnetDiscriminativeSupervision() { } // Initialize the object from an object of type discriminative::Supervision, - // and some extra information. + // and some extra information. // Note: you probably want to set 'name' to "output". // 'first_frame' will often be zero but you can choose (just make it // consistent with how you numbered your inputs), and 'frame_skip' would be 1 // in a vanilla setup, but 3 in the case of 'chain' models NnetDiscriminativeSupervision(const std::string &name, const discriminative::DiscriminativeSupervision &supervision, - const Vector &deriv_weights, + const VectorBase &deriv_weights, int32 first_frame, int32 frame_skip); @@ -89,15 +89,15 @@ struct NnetDiscriminativeSupervision { void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); - + void Swap(NnetDiscriminativeSupervision *other); void CheckDim() const; - + bool operator == (const NnetDiscriminativeSupervision &other) const; }; -/// NnetDiscriminativeExample is like NnetExample, but specialized for +/// NnetDiscriminativeExample is like NnetExample, but specialized for /// sequence training. struct NnetDiscriminativeExample { @@ -111,7 +111,7 @@ struct NnetDiscriminativeExample { std::vector outputs; void Write(std::ostream &os, bool binary) const; - + void Read(std::istream &is, bool binary); void Swap(NnetDiscriminativeExample *other); @@ -128,10 +128,10 @@ struct NnetDiscriminativeExample { } }; -/** - Appends the given vector of examples (which must be non-empty) into +/** + Appends the given vector of examples (which must be non-empty) into a single output example. - Intended to be used when forming minibatches for neural net training. If + Intended to be used when forming minibatches for neural net training. If 'compress' it compresses the output features (recommended to save disk space). @@ -149,7 +149,7 @@ void MergeDiscriminativeExamples( void MergeSupervision( const std::vector &inputs, - NnetDiscriminativeSupervision *output); + NnetDiscriminativeSupervision *output); /** Shifts the time-index t of everything in the input of "eg" by adding @@ -179,7 +179,7 @@ void ShiftDiscriminativeExampleTimes(int32 frame_shift, void TruncateDerivWeights(int32 truncate, NnetDiscriminativeExample *eg); -/** This function takes a NnetDiscriminativeExample and produces a +/** This function takes a NnetDiscriminativeExample and produces a ComputationRequest. Assumes you don't want the derivatives w.r.t. the inputs; if you do, you can create the ComputationRequest manually. 
Assumes that if diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 547c70578ab..dc9dedefe43 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -21,6 +21,8 @@ #include "nnet3/nnet-example-utils.h" #include "lat/lattice-functions.h" #include "hmm/posterior.h" +#include "util/text-utils.h" +#include namespace kaldi { namespace nnet3 { @@ -282,10 +284,73 @@ void RoundUpNumFrames(int32 frame_subsampling_factor, KALDI_ERR << "--num-frames-overlap=" << (*num_frames_overlap) << " < " << "--num-frames=" << (*num_frames); } +} + +void ExampleGenerationConfig::ComputeDerived() { + if (!SplitStringToIntegers(num_frames_str, ",", false, &num_frames) || + num_frames.empty()) { + KALDI_ERR << "Invalid option (expected comma-separated list of integers): " + << "--num-frames=" << num_frames_str; + } + int32 m = frame_subsampling_factor; + if (m < 1) { + KALDI_ERR << "Invalid value --frame-subsampling-factor=" << m; + } + bool changed = false; + for (size_t i = 0; i < num_frames.size(); i++) { + int32 value = num_frames[i]; + if (value <= 0) { + KALDI_ERR << "Invalid option --num-frames=" << num_frames_str; + } + if (value % m != 0) { + value = m * ((value / m) + 1); + changed = true; + } + num_frames[i] = value; + } + if (changed) { + std::ostringstream rounded_num_frames_str; + for (size_t i = 0; i < num_frames.size(); i++) { + if (i > 0) + rounded_num_frames_str << ','; + rounded_num_frames_str << num_frames[i]; + } + KALDI_LOG << "Rounding up --num-frames=" << num_frames_str + << " to multiples of --frame-subsampling-factor=" << m + << ", to: " << rounded_num_frames_str; + } } +UtteranceSplitter::UtteranceSplitter(const ExampleGenerationConfig &config): + config_(config) { + if (config.num_frames.empty()) { + KALDI_ERR << "You need to call ComputeDerived() on the " + "ExampleGenerationConfig()."; + } + InitSplitForLength(); +} + +float UtteranceSplitter::DefaultDurationOfSplit( + const std::vector &split) const { + if (split.empty()) // not a valid split, but useful to handle this case. + return 0.0; + float principal_num_frames = config_.num_frames[0], + num_frames_overlap = config_.num_frames_overlap; + KALDI_ASSERT(num_frames_overlap < principal_num_frames && + "--num-frames-overlap value is too high"); + float overlap_proportion = num_frames_overlap / principal_num_frames; + float ans = std::accumulate(split.begin(), split.end(), int32(0)); + for (size_t i = 0; i + 1 < split.size(); i++) { + float min_adjacent_chunk_length = std::min(split[i], split[i + 1]), + overlap = overlap_proportion * min_adjacent_chunk_length; + ans -= overlap; + } + KALDI_ASSERT(ans > 0.0); + return ans; +} + /* This comment describes the idea behind what InitChunkSize() is supposed to do, and how it relates to the purpose of class UtteranceSplitter. @@ -293,29 +358,31 @@ void RoundUpNumFrames(int32 frame_subsampling_factor, Class UtteranceSplitter is supposed to tell us, for a given utterance length, what chunk sizes to use. The chunk sizes it may choose are: - zero or more chunks of the 'principal' size (the first-listed value in - num-frames) - - at most two chunks of 'alternative' num-frames (any but the first-listed - num-frames). - - (and an empty list of chunks is not allowed as a split). A split is - effectively a multiset of chunk-sizes (the order will be randomized by the - caller). 
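For concreteness, a worked example of the DefaultDurationOfSplit() computation above, under assumed options --num-frames=150,110,40 and --num-frames-overlap=30 (so overlap_proportion = 30 / 150 = 0.2):

  split = {40, 110, 150}
  sum of chunk sizes                   = 300
  overlap credited between 40 and 110  = 0.2 * min(40, 110)  =  8
  overlap credited between 110 and 150 = 0.2 * min(110, 150) = 22
  DefaultDurationOfSplit(split)        = 300 - 8 - 22        = 270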
We represent it in code as a list of chunk-sizes, represented as a - std::vector, which is sorted to get a unique representation without repeats of - different orderings. - - The choice of spilt is determined by a cost-function that depends on the sum - of the chunk-sizes in the split and the length of the utterance: the idea is - that we want the sum of chunk-sizes in the split to be as close as possible to - the utterance length. The cost-function penalizes the sum of chunk-sizes - being smaller than the utterance-length (leading to gaps) twice as much as - when the sum of chunk-sizes is larger than the utterance length. I.e. - cost(chunk_size_sum, utt_length) = (chunk_size_sum > utt_length ? - chunk_size_sum - utt_length : - 2 * (utt_length - chunk_size_sum)) + --num-frames option) + - at most two chunks of 'alternative' num-frames (meaning, any but the + first-listed choice in the --num-frames option). + + (note: an empty list of chunks is not allowed as a split). A split is + a list of chunk-sizes in increasing order (we when we actually split the + utterance into chunks, we may, at random, reverse the order. + + The choice of split to use for a given utterance-length is determined as + follows. Firstly, for each split we compute a 'default duration' (see + DefaultDurationOfSplit()... if --num-frames-overlap is zero, this is just the + sum of the chunk sizes). We then use by a cost-function that depends on + default-duration and the length of the utterance: the idea is that these two + should be as close as possible, but penalizing the default-duration being + larger than the utterance-length (which in the normal case of + --num-frames-overlap=0 would lead to gaps between the segments), twice as much + as the other sign of difference. + + Specifically: + cost(default_duration, utt_length) = (default_duration > utt_length ? + default_duration - utt_length : + 2.0 * (utt_length - default_duration)) [but as a special case, set c to infinity if the largest chunk size in the - split is longer than the utterance length; we couldn't, in that case, use - this split for this utterance]. - + split is longer than the utterance length; we couldn't, in that case, use + this split for this utterance]. We want to make sure a good variety of combinations of chunk sizes are chosen in case there are ties from the cost function. For each utterance length @@ -324,11 +391,11 @@ void RoundUpNumFrames(int32 frame_subsampling_factor, chunks for a particular utterance of that length, we will choose randomly from that pool of splits. */ -void UtteranceSplitter::InitChunkSize() { +void UtteranceSplitter::InitSplitForLength() { int32 max_utterance_length = MaxUtteranceLength(); // The 'splits' vector is a list of possible splits (a split being - // a multiset of chunk-sizes, represented as a sorted vector). + // a sorted vector of chunk-sizes). // The vector 'splits' is itself sorted. std::vector > splits; InitSplits(&splits); @@ -338,9 +405,9 @@ void UtteranceSplitter::InitChunkSize() { // vector, and let a cost c >= 0 represent the mismatch between an // utterance length and the total length of the chunk sizes in a split: - // c(chunk_size_sum, utt_length) = (chunk_size_sum > utt_length ? - // chunk_size_sum - utt_length : - // 2 * (utt_length - chunk_size_sum)) + // c(default_duration, utt_length) = (default_duration > utt_length ? 
+ // default_duration - utt_length : + // 2.0 * (utt_length - default_duration)) // [but as a special case, set c to infinity if the largest chunk size in the // split is longer than the utterance length; we couldn't, in that case, use // this split for this utterance]. @@ -348,52 +415,51 @@ void UtteranceSplitter::InitChunkSize() { // 'costs_for_length[u][s]', indexed by utterance-length u and then split, // contains the cost for utterance-length u and split s. - std::vector > costs_for_length( + std::vector > costs_for_length( max_utterance_length + 1); int32 num_splits = splits.size(); - for (int32 u = 0; u <= max_utterance_length; u++) - pairs_for_length[u].reserve(num_splits); + costs_for_length[u].reserve(num_splits); for (int32 s = 0; s < num_splits; s++) { const std::vector &split = splits[s]; - int32 chunk_size_sum = std::accumulate(split.begin(), split.end(), - int32(0)), - max_chunk_size = *std::max_element(split.begin(), split.end()); + float default_duration = DefaultDurationOfSplit(split); + int32 max_chunk_size = *std::max_element(split.begin(), split.end()); for (int32 u = 0; u <= max_utterance_length; u++) { // c is the cost for this utterance length and this split. We penalize // gaps twice as strongly as overlaps, based on the intuition that // completely throwing out frames of data is worse than counting them - // twice. It might be possible to come up with some kind of mathematical - // justification for this based on variance of the estimated gradient. - int32 c = (chunk_size_sum > u ? chunk_size_sum - u : - 2 * (u - chunk_size_sum)); - if (max_chunk_size > u) - c = std::numeric_limits::max(); - pairs_for_length[u].push_back(c); + // twice. + int32 c = (default_duration > float(u) ? default_duration - u : + 2 * (u - default_duration)); + if (u < max_chunk_size) + c = std::numeric_limits::max(); + costs_for_length[u].push_back(c); } } splits_for_length_.resize(max_utterance_length + 1); - for (int32 u = 0; u <= max_utterance_length; u++) { - const std::vector &costs = costs_for_length[u]; - int32 min_cost = std::min_element(costs.begin(), costs.end()); - if (min_cost == std::numeric_limits::max()) { + const std::vector &costs = costs_for_length[u]; + float min_cost = *std::min_element(costs.begin(), costs.end()); + if (min_cost == std::numeric_limits::max()) { // All costs were infinity, becaues this utterance-length u is shorter // than the smallest chunk-size. Leave splits_for_length_[u] as empty // for this utterance-length, meaning we will not be able to choose any // split, and such utterances will be discarded. continue; } - int32 cost_threshold = 2; // We will choose pseudo-randomly from splits - // that are within this distance from the best - // cost. + float cost_threshold = 1.9999; // We will choose pseudo-randomly from splits + // that are within this distance from the + // best cost. Make the threshold just + // slightly less than 2... this will + // hopefully make the behavior more + // deterministic for ties. 
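To make the tie-breaking concrete, a worked example with assumed numbers: for an utterance of length u = 290, a split whose default duration is 291 costs 291 - 290 = 1 (a slight overlap), while one whose default duration is 289 costs 2 * (290 - 289) = 2, since gaps are penalized twice as heavily as overlaps. Both costs lie within the 1.9999 window above the best cost (2 < 1 + 1.9999), so both splits would be kept in the pool for length 290 and one of them chosen pseudo-randomly per utterance.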
std::vector possible_splits; - std::vector::const_iterator iter = costs.begin(), end = costs.end(); + std::vector::const_iterator iter = costs.begin(), end = costs.end(); int32 s = 0; for (; iter != end; ++iter,++s) if (*iter < min_cost + cost_threshold) @@ -429,19 +495,45 @@ void UtteranceSplitter::InitChunkSize() { } -void GetChunkSizesForUtterance(int32 utterance_length, - std::vector *chunk_sizes) const { - KALDI_ASSERT(!splits_for_length.empty()); +bool UtteranceSplitter::LengthsMatch(const std::string &utt, + int32 utterance_length, + int32 supervision_length) const { + int32 sf = config_.frame_subsampling_factor, + expected_supervision_length = (utterance_length + sf - 1) / sf; + if (supervision_length == expected_supervision_length) { + return true; + } else { + if (sf == 1) { + KALDI_WARN << "Supervision does not have expected length for utterance " + << utt << ": expected length = " << utterance_length + << ", got " << supervision_length; + } else { + KALDI_WARN << "Supervision does not have expected length for utterance " + << utt << ": expected length = (" << utterance_length + << " + " << sf << " - 1) / " << sf << " = " + << expected_supervision_length + << ", got: " << supervision_length + << " (note: --frame-subsampling-factor=" << sf << ")"; + } + return false; + } +} + + +void UtteranceSplitter::GetChunkSizesForUtterance( + int32 utterance_length, std::vector *chunk_sizes) const { + KALDI_ASSERT(!splits_for_length_.empty()); // 'primary_length' is the first-specified num-frames. // It's the only chunk that may be repeated an arbitrary number // of times. int32 primary_length = config_.num_frames[0], + num_frames_overlap = config_.num_frames_overlap, max_tabulated_length = splits_for_length_.size() - 1, num_primary_length_repeats = 0; - + KALDI_ASSERT(primary_length - num_frames_overlap > 0); KALDI_ASSERT(utterance_length >= 0); while (utterance_length > max_tabulated_length) { - utterance_length -= primary_length; + utterance_length -= (primary_length - num_frames_overlap); num_primary_length_repeats++; } KALDI_ASSERT(utterance_length >= 0); @@ -452,9 +544,11 @@ void GetChunkSizesForUtterance(int32 utterance_length, *chunk_sizes = possible_splits[randomly_chosen_split]; for (int32 i = 0; i < num_primary_length_repeats; i++) chunk_sizes->push_back(primary_length); - // Randomize the order in which the chunks appear. - std::random_shuffle(chunk_sizes->begin(), - chunk_sizes->end()); + + std::sort(chunk_sizes->begin(), chunk_sizes->end()); + if (RandInt(0, 1) == 0) { + std::reverse(chunk_sizes->begin(), chunk_sizes->end()); + } } @@ -474,14 +568,15 @@ int32 UtteranceSplitter::MaxUtteranceLength() const { } void UtteranceSplitter::InitSplits(std::vector > *splits) const { - // we consider splits whose total length is up to MaxUtteranceLength() + - // primary_length. We can be confident without doing a lot of math, that - // multisets above this length will never be chosen for any utterance-length - // up to MaxUtteranceLength(). + // we consider splits whose default duration (as returned by + // DefaultDurationOfSplit()) is up to MaxUtteranceLength() + primary_length. + // We can be confident without doing a lot of math, that splits above this + // length will never be chosen for any utterance-length up to + // MaxUtteranceLength() (which is the maximum we use). 
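As an illustration of the enumeration this comment describes, assume --num-frames=150,110,40 and --num-frames-overlap=0, so a split's default duration is simply the sum of its chunk sizes and the ceiling is MaxUtteranceLength() + 150 = (2 * 150 + 150) + 150 = 600. The sorted splits generated then include

  {40}, {110}, {150},
  {40, 110}, {110, 110}, {40, 150}, {110, 150}, {150, 150},
  {40, 110, 150}, {150, 150, 150}, {150, 150, 150, 150}, ...

i.e. at most two 'alternate' sizes (40 and/or 110) combined with any number of copies of the primary size 150, keeping the total at or below 600.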
int32 primary_length = config_.num_frames[0], - length_ceiling = MaxUtteranceLength() + primary_length; + default_duration_ceiling = MaxUtteranceLength() + primary_length; - typedef std::unordered_set, VectorHasher > SetType; + typedef unordered_set, VectorHasher > SetType; SetType splits_set; @@ -490,24 +585,23 @@ void UtteranceSplitter::InitSplits(std::vector > *splits) con // The splits we are allow are: zero to two 'alternate' lengths, plus // an arbitrary number of repeats of the 'primary' length. The repeats // of the 'primary' length are handled by the inner loop over n. - // The zero two two 'alternate' lengths are handled by the loops over + // The zero to two 'alternate' lengths are handled by the loops over // i and j. i == 0 and j == 0 are special cases; they mean, no // alternate is chosen. for (int32 i = 0; i < num_lengths; i++) { - for (int32 j = 0; j < num_length; j++) { + for (int32 j = 0; j < num_lengths; j++) { std::vector vec; if (i > 0) vec.push_back(config_.num_frames[i]); if (j > 0) vec.push_back(config_.num_frames[j]); - for (int32 n = 0; - std::accumulate(vec.begin(), vec.end(), int32(0)) <= length_ceiling; - ++n, vec.push_back(primary_length)) { - std::sort(vec.begin(), vec.end()); // we don't want to treat different - // orderings of the same values as - // different, so sort them. + int32 n = 0; + while (DefaultDurationOfSplit(vec) <= default_duration_ceiling) { if (!vec.empty()) // Don't allow the empty vector as a split. splits_set.insert(vec); + n++; + vec.push_back(primary_length); + std::sort(vec.begin(), vec.end()); } } } @@ -521,11 +615,11 @@ void UtteranceSplitter::InitSplits(std::vector > *splits) con // static -void UtteranceSplitter::DistributeRandomly(int32 n, std::vector *vec) { +void UtteranceSplitter::DistributeRandomlyUniform(int32 n, std::vector *vec) { KALDI_ASSERT(!vec->empty()); int32 size = vec->size(); if (n < 0) { - DistributeRandomly(n, vec); + DistributeRandomlyUniform(-n, vec); for (int32 i = 0; i < size; i++) (*vec)[i] *= -1; return; @@ -544,6 +638,48 @@ void UtteranceSplitter::DistributeRandomly(int32 n, std::vector *vec) { } +// static +void UtteranceSplitter::DistributeRandomly(int32 n, + const std::vector &magnitudes, + std::vector *vec) { + KALDI_ASSERT(!vec->empty() && vec->size() == magnitudes.size()); + int32 size = vec->size(); + if (n < 0) { + DistributeRandomly(-n, magnitudes, vec); + for (int32 i = 0; i < size; i++) + (*vec)[i] *= -1; + return; + } + float total_magnitude = std::accumulate(magnitudes.begin(), magnitudes.end(), + int32(0)); + KALDI_ASSERT(total_magnitude > 0); + // note: 'partial_counts' contains the negative of the partial counts, so + // when we sort the larger partial counts come first. + std::vector > partial_counts; + int32 total_count = 0; + for (int32 i = 0; i < size; i++) { + float this_count = float(n) / total_magnitude; + // note: cast of float to int32 rounds towards zero (down, in this + // case, since this_count >= 0). + int32 this_whole_count = static_cast(this_count), + this_partial_count = this_count - this_whole_count; + (*vec)[i] = this_whole_count; + total_count += this_whole_count; + partial_counts.push_back(std::pair(-this_partial_count, i)); + } + KALDI_ASSERT(total_count <= n && total_count + size >= n); + std::sort(partial_counts.begin(), partial_counts.end()); + int32 i = 0; + // Increment by one the elements of the vector that has the largest partial + // count, then the next largest partial count, and so on... until we reach the + // desired total-count 'n'. 
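A worked example of the proportional assignment described here, with assumed inputs n = 10 and magnitudes = {40, 110}: the ideal shares are 10 * 40 / 150 = 2.67 and 10 * 110 / 150 = 7.33; the whole parts {2, 7} sum to 9, and the single missing unit goes to the slot with the largest fractional part, giving {3, 7}. With n = -10 the result is simply negated, {-3, -7}.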
+ for(; total_count < n; i++,total_count++) { + (*vec)[partial_counts[i].second]++; + } + KALDI_ASSERT(std::accumulate(vec->begin(), vec->end(), int32(0)) == n); +} + + void UtteranceSplitter::GetGapSizes(int32 utterance_length, bool enforce_subsampling_factor, const std::vector &chunk_sizes, @@ -552,7 +688,7 @@ void UtteranceSplitter::GetGapSizes(int32 utterance_length, gap_sizes->clear(); return; } - if (enforce_subsamping_factor && config_.frame_subsampling_factor > 1) { + if (enforce_subsampling_factor && config_.frame_subsampling_factor > 1) { int32 sf = config_.frame_subsampling_factor, size = chunk_sizes.size(); int32 utterance_length_reduced = (utterance_length + (sf - 1)) / sf; std::vector chunk_sizes_reduced(chunk_sizes); @@ -576,7 +712,9 @@ void UtteranceSplitter::GetGapSizes(int32 utterance_length, if (total_gap < 0) { // there is an overlap. Overlaps can only go between chunks, not at the - // beginning or end of the utterance. + // beginning or end of the utterance. Also, we try to make the length of + // overlap proportional to the size of the smaller of the two chunks + // that the overlap is between. if (num_chunks == 1) { // there needs to be an overlap, but there is only one chunk... this means // the chunk-size exceeds the utterance length, which is not allowed. @@ -586,16 +724,32 @@ void UtteranceSplitter::GetGapSizes(int32 utterance_length, } // note the elements of 'overlaps' will be <= 0. - std::vector overlaps(num_chunks - 1); - DistributeRandomly(total_gap, &num_overlap_locations); + std::vector magnitudes(num_chunks - 1), + overlaps(num_chunks - 1); + // the 'magnitudes' vector will contain the minimum of the lengths of the + // two adjacent chunks between which are are going to consider having an + // overlap. These will be used to assign the overlap proportional to that + // size. + for (int32 i = 0; i + 1 < num_chunks; i++) { + magnitudes[i] = std::min(chunk_sizes[i], chunk_sizes[i + 1]); + } + DistributeRandomly(total_gap, magnitudes, &overlaps); + for (int32 i = 0; i + 1 < num_chunks; i++) { + // If the following condition does not hold, it's possible we + // could get chunk start-times less than zero. I don't believe + // it's possible for this condition to fail, but we're checking + // for it at this level to make debugging easier, just in case. + KALDI_ASSERT(overlaps[i] <= magnitudes[i]); + } + (*gap_sizes)[0] = 0; // no gap before 1st chunk. for (int32 i = 1; i < num_chunks; i++) (*gap_sizes)[i] = overlaps[i-1]; } else { // There may be a gap. Gaps can go at the start or end of the utterance, or - // between segments. + // between segments. We try to distribute the gaps evenly. std::vector gaps(num_chunks + 1); - DistributeRandomly(total_gap, &gaps); + DistributeRandomlyUniform(total_gap, &gaps); // the last element of 'gaps', the one at the end of the utterance, is // implicit and doesn't have to be written to the output. for (int32 i = 0; i < num_chunks; i++) @@ -610,7 +764,7 @@ void UtteranceSplitter::GetChunksForUtterance( std::vector chunk_sizes; GetChunkSizesForUtterance(utterance_length, &chunk_sizes); std::vector gaps(chunk_sizes.size()); - GetGapSizes(utterance_length, true, chunk_sizes, &gap_sizes); + GetGapSizes(utterance_length, true, chunk_sizes, &gaps); int32 num_chunks = chunk_sizes.size(); chunk_info->resize(num_chunks); int32 t = 0; @@ -622,7 +776,7 @@ void UtteranceSplitter::GetChunksForUtterance( info.left_context = (i == 0 && config_.left_context_initial >= 0 ? 
config_.left_context_initial : config_.left_context); info.right_context = (i == 0 && config_.right_context_final >= 0 ? - config_.right_context_final : config_.right_context); + config_.right_context_final : config_.right_context); t += chunk_sizes[i]; } // check that the end of the last chunk doesn't go more than @@ -631,5 +785,35 @@ void UtteranceSplitter::GetChunksForUtterance( KALDI_ASSERT(t - utterance_length < config_.frame_subsampling_factor); } +void UtteranceSplitter::SetOutputWeights( + int32 utterance_length, + std::vector *chunk_info) const { + int32 sf = config_.frame_subsampling_factor; + int32 num_output_frames = (utterance_length + sf - 1) / sf; + // num_output_frames is the number of frames of supervision. 'count[t]' will + // be the number of chunks that this output-frame t appears in. Note: the + // 'first_frame' and 'num_frames' members of ChunkTimeInfo will always be + // multiples of frame_subsampling_factor. + std::vector count(num_output_frames, 0); + int32 num_chunks = chunk_info->size(); + for (int32 i = 0; i < num_chunks; i++) { + ChunkTimeInfo &chunk = (*chunk_info)[i]; + for (int32 t = chunk.first_frame / sf; + t < (chunk.first_frame + chunk.num_frames) / sf; + t++) + count[t]++; + } + for (int32 i = 0; i < num_chunks; i++) { + ChunkTimeInfo &chunk = (*chunk_info)[i]; + chunk.output_weights.resize(chunk.num_frames / sf); + int32 t_start = chunk.first_frame / sf; + for (int32 t = t_start; + t < (chunk.first_frame + chunk.num_frames) / sf; + t++) + chunk.output_weights[t - t_start] = 1.0 / count[t]; + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index d02aa336a10..754743d581e 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -78,12 +78,13 @@ void ReadVectorAsChar(std::istream &is, // Warning: after reading in the values from the command line // (Register() and then then po.Read()), you should then call ComputeDerived() // to set up the 'derived values' (parses 'num_frames_str'). -struct ExampleExtractionConfig { +struct ExampleGenerationConfig { int32 left_context; int32 right_context; int32 left_context_initial; int32 right_context_final; int32 num_frames_overlap; + int32 frame_subsampling_factor; std::string num_frames_str; @@ -95,16 +96,14 @@ struct ExampleExtractionConfig { // frames, to be used at most once or twice per file. std::vector num_frames; - ExampleExtractionConfig(): + ExampleGenerationConfig(): left_context(0), right_context(0), - left_context_initial(-1), right_context_initial(-1), - num_frames_overlap(0), + left_context_initial(-1), right_context_final(-1), + num_frames_overlap(0), frame_subsampling_factor(1), num_frames_str("1") { } - /// This function decodes 'num_frames_str' into 'num_frames' and 'num_frames_alternatives', - /// and ensures that 'num_frames', and the members of num_frames_alternatives' are - /// multiples of 'frame_subsampling_factor'. - /// + /// This function decodes 'num_frames_str' into 'num_frames', and ensures that + /// the members of 'num_frames' are multiples of 'frame_subsampling_factor'. void ComputeDerived(); void Register(OptionsItf *po) { @@ -135,19 +134,22 @@ struct ExampleExtractionConfig { "deal with odd-sized inputs we may also generate egs with these " "other sizes. 
All these values will be rounded up to the " "closest multiple of --frame-subsampling-factor."); - po.Register("num-frames-overlap", &num_frames_overlap, "Number of frames of " - "overlap between adjacent examples (advisory, will not be " - "exactly enforced)"); - po.Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " - "if the frame-rate of the output labels in the generated " - "examples will be less than the frame-rate at the input"); + po->Register("num-frames-overlap", &num_frames_overlap, "Number of frames of " + "overlap between adjacent eamples (applies to chunks of size " + "equal to the primary [first-listed] --num-frames value... " + "will be adjusted for different-sized chunks). Advisory; " + "will not be exactly enforced."); + po->Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " + "if the frame-rate of the output labels in the generated " + "examples will be less than the frame-rate at the input"); } }; /** - struct ChunkTimeInfo is used by class Utterane + struct ChunkTimeInfo is used by class UtteranceSplitter to output + information about how we split an utterance into chunks. */ struct ChunkTimeInfo { @@ -155,26 +157,54 @@ struct ChunkTimeInfo { int32 num_frames; int32 left_context; int32 right_context; + // The 'output_weights' member is a vector of length equal to the + // num_frames divided by frame_subsampling_factor from the config. + // It contains values 0 < x <= 1 that represent weightings of + // output-frames. The idea is that if (because of overlaps) a + // frame appears in multiple chunks, we want to downweight it + // so that the total weight remains 1. (Of course, the calling + // code is free to ignore these weights if desired). + std::vector output_weights; }; class UtteranceSplitter { + public: - UtteranceSplitter(const ExampleExtractionConfig &config); + UtteranceSplitter(const ExampleGenerationConfig &config); - // Given an utterance length, this function creates for you a set of - // chunks into which to split the utterance. Note: this is partly - // random (will call srand()). + const ExampleGenerationConfig& Config() const { return config_; } + + // Given an utterance length, this function creates for you a list of chunks + // into which to split the utterance. Note: this is partly random (will call + // srand()). void GetChunksForUtterance(int32 utterance_length, std::vector *chunk_info) const; + // This function returns true if 'supervision_length' (e.g. the length of the + // posterior, lattice or alignment) is what we expect given + // config_.frame_subsampling_factor. If not, it prints a warning (which is + // why the function needs 'utt', and returns false. Note: we round up, so + // writing config_.frame_subsampling_factor as sf, we expect + // supervision_length = (utterance_length + sf - 1) / sf. + bool LengthsMatch(const std::string &utt, + int32 utterance_length, + int32 supervision_length) const; + + private: void InitSplitForLength(); + // This function returns the 'default duration' in frames of a split, which if + // config_.num_frames_overlap is zero is just the sum of chunk sizes in the + // split (i.e. the sum of the vector's elements), but otherwise, we subtract + // the recommended overlap (see code for details). + float DefaultDurationOfSplit(const std::vector &split) const; + // Used in InitSplitForLength(), returns the maximum utterance-length considered // separately in split_for_length_. 
[above this, we'll assume that the additional @@ -220,17 +250,30 @@ class UtteranceSplitter { std::vector *gap_sizes) const; - // this static function, used in GetGapSizes(), writes values to - // a vector 'vec' such the sum of those values equals n. It - // tries to make those values as similar as possible (they will - // differ by at most one), and the location of the larger versus - // smaller values is random. n may be negative. 'vec' must be - // nonempty. + // this static function, used in GetGapSizes(), writes random values to a + // vector 'vec' such the sum of those values equals n (n may be positive or + // negative). It tries to make those values as similar as possible (they will + // differ by at most one), and the location of the larger versus smaller + // values is random. 'vec' must be nonempty. + static void DistributeRandomlyUniform(int32 n, + std::vector *vec); + + // this static function, used in GetGapSizes(), writes values to a vector + // 'vec' such the sum of those values equals n (n may be positive or + // negative). It tries to make those values, as exactly as it can, + // proportional to the values in 'magnitudes', which must be positive. 'vec' + // must be nonempty, and 'magnitudes' must be the same size as 'vec'. static void DistributeRandomly(int32 n, + const std::vector &magnitudes, std::vector *vec); + // This function is responsible for setting the 'output_weights' + // members of the chunks. + void SetOutputWeights(int32 utterance_lengths, + std::vector *chunk_info) const; + - const ExampleExtractionConfig &config_; + const ExampleGenerationConfig &config_; // The vector 'split_for_length_' is indexed by the num-frames of a file, and // gives us a list of alternative splits that we can use if the utternace has @@ -246,34 +289,16 @@ class UtteranceSplitter { // If an utterance's num-frames is >= split_for_length.size(), the way to find // the split to use is to keep subtracting the primary num-frames (== - // config_.num_frames[0]) from the utterance length until the resulting - // num-frames is < split_for_length_.size(), chunks, and then add the subtracted - // number of copies of the primary num-frames. + // config_.num_frames[0]) minus the num-frames-overlap, from the utterance + // length, until the resulting num-frames is < split_for_length_.size(), + // chunks, and then add the subtracted number of copies of the primary + // num-frames to the split. std::vector > > splits_for_length_; }; -void ComputeExampleTimeInfo(const ExampleExtractionConfig &config, - int32 num_frames_in_utt, - - SplitIntoRanges(int32 num_frames, - int32 frames_per_range, - std::vector *range_starts); - - - - - -// This function rounds up the quantities 'num_frames' and 'num_frames_overlap' -// to the nearest multiple of the frame_subsampling_factor -void RoundUpNumFrames(int32 frame_subsampling_factor, - int32 *num_frames, - int32 *num_frames_overlap); - - - } // namespace nnet3 diff --git a/src/nnet3bin/nnet3-discriminative-get-egs.cc b/src/nnet3bin/nnet3-discriminative-get-egs.cc index 786ed609a33..6055dc3d20c 100644 --- a/src/nnet3bin/nnet3-discriminative-get-egs.cc +++ b/src/nnet3bin/nnet3-discriminative-get-egs.cc @@ -32,155 +32,105 @@ namespace kaldi { namespace nnet3 { -/** - This function does all the processing for one utterance, and outputs the - supervision objects to 'example_writer'. 
-*/ - -static bool ProcessFile( - const discriminative::SplitDiscriminativeSupervisionOptions &config, +// This function does all the processing for one utterance, and outputs the +// examples to 'example_writer'. +static bool ProcessFile(const discriminative::SplitDiscriminativeSupervisionOptions &config, const TransitionModel &tmodel, const MatrixBase &feats, const MatrixBase *ivector_feats, + int32 ivector_period, const discriminative::DiscriminativeSupervision &supervision, const std::string &utt_id, bool compress, - int32 left_context, - int32 right_context, - int32 frames_per_eg, - int32 frames_overlap_per_eg, - int32 frame_subsampling_factor, + const UtteranceSplitter &utt_splitter, int64 *num_frames_written, int64 *num_egs_written, NnetDiscriminativeExampleWriter *example_writer) { KALDI_ASSERT(supervision.num_sequences == 1); - int32 num_feature_frames = feats.NumRows(), - num_output_frames = supervision.frames_per_sequence, - num_feature_frames_subsampled = - (num_feature_frames + frame_subsampling_factor - 1)/ - frame_subsampling_factor; - if (num_output_frames != num_feature_frames_subsampled) - KALDI_ERR << "Mismatch in num-frames: discriminative supervision has " - << num_output_frames - << " versus features/frame_subsampling_factor = " - << num_feature_frames << " / " << frame_subsampling_factor - << ": check that --frame-subsampling-factor option is set " - << "the same as to discriminative-get-supervision."; - - KALDI_ASSERT(frames_per_eg % frame_subsampling_factor == 0); - - int32 frames_per_eg_subsampled = frames_per_eg / frame_subsampling_factor, - frames_overlap_subsampled = frames_overlap_per_eg / frame_subsampling_factor, - frames_shift_subsampled = frames_per_eg_subsampled - frames_overlap_subsampled; - - if (frames_per_eg != -1 && num_feature_frames_subsampled < frames_per_eg_subsampled) { - KALDI_WARN << "No output for utterance " << utt_id - << " (num-frames=" << num_feature_frames - << ") because too short for --frames-per-eg=" - << frames_per_eg; - return false; - } + int32 num_input_frames = feats.NumRows(), + num_output_frames = supervision.frames_per_sequence; - // we don't do any padding, as it would be a bit tricky to pad the discriminative training supervision. - // Instead we select ranges of frames that fully fit within the file; these - // might slightly overlap with each other or have gaps. - std::vector range_starts_subsampled; - if (frames_per_eg != -1) { - chain::SplitIntoRanges(num_feature_frames_subsampled - - frames_overlap_subsampled, - frames_shift_subsampled, - &range_starts_subsampled); - } else { - range_starts_subsampled.push_back(0); - } - // The 'deriv_weights' make sure we don't count frames twice, and also ensure - // that we tend to avoid having nonzero weights on the derivatives that are - // too close to the edge of the corresponding 'range' (these derivatives close - // to the edge are not as accurate as they could be, because when we split we - // don't know the correct alphas and betas). 
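// A minimal illustrative sketch (hypothetical helper, simplified types; not a
// Kaldi API) of the length convention behind the LengthsMatch() call used in
// the new code above: with frame-subsampling-factor sf, the supervision is
// expected to contain the input length rounded up, i.e. ceil(num_input_frames / sf).
#include <cstdint>

static bool LengthsMatchSketch(int32_t num_input_frames,
                               int32_t supervision_length,
                               int32_t frame_subsampling_factor) {
  int32_t sf = frame_subsampling_factor,
      expected = (num_input_frames + sf - 1) / sf;  // e.g. 103 frames, sf = 3 -> 35
  return supervision_length == expected;
}
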
- std::vector > deriv_weights; - if (frames_per_eg != -1) { - chain::GetWeightsForRanges(frames_per_eg_subsampled, - range_starts_subsampled, - &deriv_weights); - - if (range_starts_subsampled.empty()) { - KALDI_WARN << "No output for utterance " << utt_id - << " (num-frames=" << num_feature_frames - << ") because too short for --frames-per-eg=" - << frames_per_eg; - return false; - } - } else { - deriv_weights.push_back(Vector()); + if (!utt_splitter.LengthsMatch(utt_id, num_input_frames, num_output_frames)) + return false; // LengthsMatch() will have printed a warning. + + std::vector chunks; + + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_input_frames << " frames."; } - discriminative::DiscriminativeSupervisionSplitter splitter(config, tmodel, + int32 frame_subsampling_factor = utt_splitter.Config().frame_subsampling_factor; + + utt_splitter.GetChunksForUtterance(num_input_frames, &chunks); + + discriminative::DiscriminativeSupervisionSplitter splitter(config, tmodel, supervision); - for (size_t i = 0; i < range_starts_subsampled.size(); i++) { + for (size_t c = 0; c < chunks.size(); c++) { + ChunkTimeInfo &chunk = chunks[c]; NnetDiscriminativeExample nnet_discriminative_eg; nnet_discriminative_eg.outputs.resize(1); - int32 range_start_subsampled = range_starts_subsampled[i], - range_start = range_start_subsampled * frame_subsampling_factor; - - if (frames_per_eg != -1) { - - discriminative::DiscriminativeSupervision supervision_part; - - splitter.GetFrameRange(range_start_subsampled, - frames_per_eg_subsampled, - (i == 0 ? false : true), - &supervision_part); - - int32 first_frame = 0; // we shift the time-indexes of all these parts so - // that the supervised part starts from frame 0. - NnetDiscriminativeSupervision nnet_supervision("output", supervision_part, - deriv_weights[i], - first_frame, - frame_subsampling_factor); - nnet_discriminative_eg.outputs[0].Swap(&nnet_supervision); - } else { - int32 first_frame = 0; // we shift the time-indexes of all these parts so - // that the supervised part starts from frame 0. - NnetDiscriminativeSupervision nnet_supervision("output", supervision, - deriv_weights[i], - first_frame, - frame_subsampling_factor); - nnet_discriminative_eg.outputs[0].Swap(&nnet_supervision); - } + + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; + + discriminative::DiscriminativeSupervision supervision_part; + + splitter.GetFrameRange(start_frame_subsampled, + num_frames_subsampled, + (c == 0 ? false : true), + &supervision_part); + + SubVector output_weights( + &(chunk.output_weights[0]), + static_cast(chunk.output_weights.size())); + + int32 first_frame = 0; // we shift the time-indexes of all these parts so + // that the supervised part starts from frame 0. + NnetDiscriminativeSupervision nnet_supervision("output", supervision_part, + output_weights, + first_frame, + frame_subsampling_factor); + nnet_discriminative_eg.outputs[0].Swap(&nnet_supervision); nnet_discriminative_eg.inputs.resize(ivector_feats != NULL ? 2 : 1); - int32 this_frames_per_eg = frames_per_eg != -1 ? 
frames_per_eg : supervision.frames_per_sequence; - int32 tot_frames = left_context + this_frames_per_eg + right_context; - Matrix input_frames(tot_frames, feats.NumCols(), kUndefined); + int32 tot_input_frames = chunk.left_context + chunk.num_frames + + chunk.right_context; + + Matrix input_frames(tot_input_frames, feats.NumCols(), + kUndefined); - // Set up "input_frames". - for (int32 j = -left_context; j < this_frames_per_eg + right_context; j++) { - int32 t = range_start + j; - if (t < 0) t = 0; - if (t >= feats.NumRows()) t = feats.NumRows() - 1; - SubVector src(feats, t), - dest(input_frames, j + left_context); + int32 start_frame = chunk.first_frame - chunk.left_context; + for (int32 t = start_frame; t < start_frame + tot_input_frames; t++) { + int32 t2 = t; + if (t2 < 0) t2 = 0; + if (t2 >= num_input_frames) t2 = num_input_frames - 1; + int32 j = t - start_frame; + SubVector src(feats, t2), + dest(input_frames, j); dest.CopyFromVec(src); } - NnetIo input_io("input", - left_context, - input_frames); + + NnetIo input_io("input", -chunk.left_context, input_frames); nnet_discriminative_eg.inputs[0].Swap(&input_io); if (ivector_feats != NULL) { // if applicable, add the iVector feature. - // try to get closest frame to middle of window to get - // a representative iVector. - int32 closest_frame = range_start + this_frames_per_eg / 2; - KALDI_ASSERT(ivector_feats->NumRows() > 0); - if (closest_frame >= ivector_feats->NumRows()) - closest_frame = ivector_feats->NumRows() - 1; + // choose iVector from a random frame in the chunk + int32 ivector_frame = RandInt(start_frame, + start_frame + num_input_frames - 1), + ivector_frame_subsampled = ivector_frame / ivector_period; + if (ivector_frame_subsampled < 0) + ivector_frame_subsampled = 0; + if (ivector_frame_subsampled >= ivector_feats->NumRows()) + ivector_frame_subsampled = ivector_feats->NumRows() - 1; Matrix ivector(1, ivector_feats->NumCols()); - ivector.Row(0).CopyFromVec(ivector_feats->Row(closest_frame)); + ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled)); NnetIo ivector_io("ivector", 0, ivector); nnet_discriminative_eg.inputs[1].Swap(&ivector_io); } @@ -189,11 +139,11 @@ static bool ProcessFile( nnet_discriminative_eg.Compress(); std::ostringstream os; - os << utt_id << "-" << range_start; + os << utt_id << "-" << chunk.first_frame; std::string key = os.str(); // key is - - *num_frames_written += this_frames_per_eg; + *num_frames_written += chunk.num_frames; *num_egs_written += 1; example_writer->Write(key, nnet_discriminative_eg); @@ -228,35 +178,28 @@ int main(int argc, char *argv[]) { "discriminative-get-supervision.\n"; bool compress = true; - int32 left_context = 0, right_context = 0, num_frames = 1, - num_frames_overlap = 0, length_tolerance = 100, - frame_subsampling_factor = 1; + int32 length_tolerance = 100, online_ivector_period = 1; + + std::string online_ivector_rspecifier; - std::string ivector_rspecifier; + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. 
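// A small illustrative sketch (hypothetical helper, simplified types) of the
// --online-ivector-period handling used in ProcessFile() above: a frame index
// is mapped to a row of the online-iVector matrix by dividing by the period
// and clamping to a valid row.
#include <algorithm>
#include <cstdint>

static int32_t IvectorRowForFrame(int32_t frame, int32_t ivector_period,
                                  int32_t num_ivector_rows) {
  int32_t row = frame / ivector_period;        // one iVector is stored per 'period' frames
  row = std::max(row, 0);                      // frames left of zero use the first row
  return std::min(row, num_ivector_rows - 1);  // frames past the end use the last row
}
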
discriminative::SplitDiscriminativeSupervisionOptions splitter_config; ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs in " "compressed format (recommended)"); - po.Register("left-context", &left_context, "Number of frames of left " - "context the neural net requires."); - po.Register("right-context", &right_context, "Number of frames of right " - "context the neural net requires."); - po.Register("num-frames", &num_frames, "Number of frames with labels " - "that each example contains. Will be rounded up to a multiple " - "of --frame-subsampling-factor."); - po.Register("num-frames-overlap", &num_frames_overlap, "Number of frames of " - "overlap between each example (could be useful in conjunction " - "--min-deriv-time and --max-deriv-time, to avoid wasting data). " - "Each time we shift by --num-frames minus --num-frames-overlap."); - po.Register("ivectors", &ivector_rspecifier, "Rspecifier of ivector " + po.Register("ivectors", &online_ivector_rspecifier, "Alias for --online-ivectors " + "option, for back compatibility"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of ivector " "features, as a matrix."); + po.Register("online-ivector-period", &online_ivector_period, "Number of frames " + "between iVectors in matrices supplied to the --online-ivectors " + "option"); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - po.Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " - "if the frame-rate at the output will be less than the " - "frame-rate of the input"); - + eg_config.Register(&po); + ParseOptions splitter_opts("supervision-splitter", &po); splitter_config.Register(&splitter_opts); @@ -267,13 +210,8 @@ int main(int argc, char *argv[]) { exit(1); } - if (left_context < 0 || right_context < 0 || - length_tolerance < 0 || frame_subsampling_factor <= 0) - KALDI_ERR << "One of the integer options is out of the allowed range."; - - if (frame_subsampling_factor != 1) - RoundUpNumFrames(frame_subsampling_factor, - &num_frames, &num_frames_overlap); + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); std::string model_wxfilename, feature_rspecifier, supervision_rspecifier, @@ -285,7 +223,7 @@ int main(int argc, char *argv[]) { examples_wspecifier = po.GetArg(4); TransitionModel tmodel; - { + { bool binary; Input ki(model_wxfilename, &binary); tmodel.Read(ki.Stream(), binary); @@ -295,7 +233,8 @@ int main(int argc, char *argv[]) { discriminative::RandomAccessDiscriminativeSupervisionReader supervision_reader( supervision_rspecifier); NnetDiscriminativeExampleWriter example_writer(examples_wspecifier); - RandomAccessBaseFloatMatrixReader ivector_reader(ivector_rspecifier); + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); int32 num_done = 0, num_err = 0; int64 num_frames_written = 0, num_egs_written = 0; @@ -308,39 +247,35 @@ int main(int argc, char *argv[]) { num_err++; } else { const discriminative::DiscriminativeSupervision &supervision = supervision_reader.Value(key); - const Matrix *ivector_feats = NULL; - if (!ivector_rspecifier.empty()) { - if (!ivector_reader.HasKey(key)) { + const Matrix *online_ivector_feats = NULL; + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(key)) { KALDI_WARN << "No iVectors for utterance " << key; num_err++; continue; } else { // this address will be valid until we call HasKey() or Value() // again. 
- ivector_feats = &(ivector_reader.Value(key)); + online_ivector_feats = &(online_ivector_reader.Value(key)); } } - if (ivector_feats != NULL && - (std::abs(feats.NumRows() - ivector_feats->NumRows()) > length_tolerance - || ivector_feats->NumRows() == 0)) { + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - (online_ivector_feats->NumRows() * + online_ivector_period)) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { KALDI_WARN << "Length difference between feats " << feats.NumRows() - << " and iVectors " << ivector_feats->NumRows() + << " and iVectors " << online_ivector_feats->NumRows() << "exceeds tolerance " << length_tolerance; num_err++; continue; } + if (ProcessFile(splitter_config, tmodel, - feats, ivector_feats, supervision, - key, compress, left_context, right_context, num_frames, - num_frames_overlap, frame_subsampling_factor, + feats, online_ivector_feats, online_ivector_period, + supervision, key, compress, utt_splitter, &num_frames_written, &num_egs_written, - &example_writer)) - num_done++; - else { - KALDI_WARN << "Failed to process utterance into nnet example " - << "for key " << key; - num_err++; - } + &example_writer)) num_done++; + else num_err++; } } @@ -355,4 +290,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc index 897ffad7b48..6b9dacfa03d 100644 --- a/src/nnet3bin/nnet3-get-egs.cc +++ b/src/nnet3bin/nnet3-get-egs.cc @@ -19,12 +19,12 @@ // limitations under the License. #include - #include "base/kaldi-common.h" #include "util/common-utils.h" #include "hmm/transition-model.h" #include "hmm/posterior.h" #include "nnet3/nnet-example.h" +#include "nnet3/nnet-example-utils.h" namespace kaldi { namespace nnet3 { @@ -32,85 +32,118 @@ namespace nnet3 { static void ProcessFile(const MatrixBase &feats, const MatrixBase *ivector_feats, + int32 ivector_period, const Posterior &pdf_post, const std::string &utt_id, bool compress, int32 num_pdfs, - int32 left_context, - int32 right_context, - int32 frames_per_eg, + const UtteranceSplitter &utt_splitter, int64 *num_frames_written, int64 *num_egs_written, NnetExampleWriter *example_writer) { - KALDI_ASSERT(feats.NumRows() == static_cast(pdf_post.size())); + int32 num_input_frames = feats.NumRows(); + if (!utt_splitter.LengthsMatch(utt_id, num_input_frames, + static_cast(pdf_post.size()))) + return; // LengthsMatch() will have printed a warning. + + std::vector chunks; + + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_input_frames << " frames."; + } - for (int32 t = 0; t < feats.NumRows(); t += frames_per_eg) { + // 'frame_subsampling_factor' is not used in any recipes at the time of + // writing, this is being supported to unify the code with the 'chain' recipes + // and in case we need it for some reason in future. + int32 frame_subsampling_factor = + utt_splitter.Config().frame_subsampling_factor; - // actual_frames_per_eg is the number of frames with nonzero - // posteriors. At the end of the file we pad with zero posteriors - // so that all examples have the same structure (prevents the need - // for recompilations). 
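// An illustrative sketch (hypothetical helper, simplified types) of the
// feats/iVector length check performed above: the number of iVector rows times
// --online-ivector-period should cover the feature frames to within
// --length-tolerance frames.
#include <cstdint>
#include <cstdlib>

static bool OnlineIvectorLengthOk(int32_t num_feat_frames,
                                  int32_t num_ivector_rows,
                                  int32_t ivector_period,
                                  int32_t length_tolerance) {
  if (num_ivector_rows == 0) return false;  // an empty iVector matrix is an error
  int32_t covered = num_ivector_rows * ivector_period;
  return std::abs(num_feat_frames - covered) <= length_tolerance;
}
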
- int32 actual_frames_per_eg = std::min(frames_per_eg, - feats.NumRows() - t); + utt_splitter.GetChunksForUtterance(num_input_frames, &chunks); + for (size_t c = 0; c < chunks.size(); c++) { + const ChunkTimeInfo &chunk = chunks[c]; - int32 tot_frames = left_context + frames_per_eg + right_context; + int32 tot_input_frames = chunk.left_context + chunk.num_frames + + chunk.right_context; - Matrix input_frames(tot_frames, feats.NumCols(), kUndefined); + Matrix input_frames(tot_input_frames, feats.NumCols(), + kUndefined); - // Set up "input_frames". - for (int32 j = -left_context; j < frames_per_eg + right_context; j++) { - int32 t2 = j + t; + int32 start_frame = chunk.first_frame - chunk.left_context; + for (int32 t = start_frame; t < start_frame + tot_input_frames; t++) { + int32 t2 = t; if (t2 < 0) t2 = 0; - if (t2 >= feats.NumRows()) t2 = feats.NumRows() - 1; + if (t2 >= num_input_frames) t2 = num_input_frames - 1; + int32 j = t - start_frame; SubVector src(feats, t2), - dest(input_frames, j + left_context); + dest(input_frames, j); dest.CopyFromVec(src); } NnetExample eg; // call the regular input "input". - eg.io.push_back(NnetIo("input", - left_context, - input_frames)); + eg.io.push_back(NnetIo("input", -chunk.left_context, input_frames)); - // if applicable, add the iVector feature. if (ivector_feats != NULL) { - // try to get closest frame to middle of window to get - // a representative iVector. - int32 closest_frame = t + (actual_frames_per_eg / 2); - KALDI_ASSERT(ivector_feats->NumRows() > 0); - if (closest_frame >= ivector_feats->NumRows()) - closest_frame = ivector_feats->NumRows() - 1; + // if applicable, add the iVector feature. + // choose iVector from a random frame in the chunk + int32 ivector_frame = RandInt(start_frame, + start_frame + num_input_frames - 1), + ivector_frame_subsampled = ivector_frame / ivector_period; + if (ivector_frame_subsampled < 0) + ivector_frame_subsampled = 0; + if (ivector_frame_subsampled >= ivector_feats->NumRows()) + ivector_frame_subsampled = ivector_feats->NumRows() - 1; Matrix ivector(1, ivector_feats->NumCols()); - ivector.Row(0).CopyFromVec(ivector_feats->Row(closest_frame)); + ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled)); eg.io.push_back(NnetIo("ivector", 0, ivector)); } - // add the labels. - Posterior labels(frames_per_eg); - for (int32 i = 0; i < actual_frames_per_eg; i++) - labels[i] = pdf_post[t + i]; - // remaining posteriors for frames are empty. - eg.io.push_back(NnetIo("output", num_pdfs, 0, labels)); + // Note: chunk.first_frame and chunk.num_frames will both be + // multiples of frame_subsampling_factor. + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; + + KALDI_ASSERT(start_frame_subsampled + num_frames_subsampled - 1 < + static_cast(pdf_post.size())); + + // Note: in all current cases there is no subsampling of output-frames going + // on (--frame-subsampling-factor=1), so you could read + // 'num_frames_subsampled' as just 'num_frames'. + Posterior labels(num_frames_subsampled); + + // TODO: it may be that using these weights is not actually helpful (with + // chain training, it was not), and that setting them all to 1 is better. + // We could add a boolean option to this program to control that; but I + // don't want to add such an option if experiments show that it is not + // helpful. 
+ for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + labels[i] = pdf_post[t]; + for (std::vector >::iterator + iter = labels[i].begin(); iter != labels[i].end(); ++iter) + iter->second *= chunk.output_weights[i]; + } if (compress) eg.Compress(); std::ostringstream os; - os << utt_id << "-" << t; + os << utt_id << "-" << chunk.first_frame; std::string key = os.str(); // key is - - *num_frames_written += actual_frames_per_eg; + *num_frames_written += chunk.num_frames; *num_egs_written += 1; example_writer->Write(key, eg); } } - -} // namespace nnet2 +} // namespace nnet3 } // namespace kaldi int main(int argc, char *argv[]) { @@ -140,28 +173,30 @@ int main(int argc, char *argv[]) { bool compress = true; - int32 num_pdfs = -1, left_context = 0, right_context = 0, - num_frames = 1, length_tolerance = 100; + int32 num_pdfs = -1, length_tolerance = 100, + online_ivector_period = 1; - std::string ivector_rspecifier; + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. + + std::string online_ivector_rspecifier; ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " - "compressed format."); + "compressed format (recommended)."); po.Register("num-pdfs", &num_pdfs, "Number of pdfs in the acoustic " "model"); - po.Register("left-context", &left_context, "Number of frames of left " - "context of input features that are added to each " - "example"); - po.Register("right-context", &right_context, "Number of frames of right " - "context of input features that are added to each " - "example"); - po.Register("num-frames", &num_frames, "Number of frames with labels " - "that each example contains."); - po.Register("ivectors", &ivector_rspecifier, "Rspecifier of ivector " - "features, as a matrix."); + po.Register("ivectors", &online_ivector_rspecifier, "Alias for " + "--online-ivectors option, for back compatibility"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " + "ivector features, as a matrix."); + po.Register("online-ivector-period", &online_ivector_period, "Number of " + "frames between iVectors in matrices supplied to the " + "--online-ivectors option"); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); + eg_config.Register(&po); po.Read(argc, argv); @@ -173,6 +208,8 @@ int main(int argc, char *argv[]) { if (num_pdfs <= 0) KALDI_ERR << "--num-pdfs options is required."; + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); std::string feature_rspecifier = po.GetArg(1), pdf_post_rspecifier = po.GetArg(2), @@ -182,7 +219,8 @@ int main(int argc, char *argv[]) { SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); RandomAccessPosteriorReader pdf_post_reader(pdf_post_rspecifier); NnetExampleWriter example_writer(examples_wspecifier); - RandomAccessBaseFloatMatrixReader ivector_reader(ivector_rspecifier); + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); int32 num_done = 0, num_err = 0; int64 num_frames_written = 0, num_egs_written = 0; @@ -201,31 +239,32 @@ int main(int argc, char *argv[]) { num_err++; continue; } - const Matrix *ivector_feats = NULL; - if (!ivector_rspecifier.empty()) { - if (!ivector_reader.HasKey(key)) { + const Matrix *online_ivector_feats = NULL; + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(key)) { KALDI_WARN << "No iVectors for utterance " << key; num_err++; 
continue; } else { // this address will be valid until we call HasKey() or Value() // again. - ivector_feats = &(ivector_reader.Value(key)); + online_ivector_feats = &(online_ivector_reader.Value(key)); } } - if (ivector_feats != NULL && - (abs(feats.NumRows() - ivector_feats->NumRows()) > length_tolerance - || ivector_feats->NumRows() == 0)) { + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - (online_ivector_feats->NumRows() * + online_ivector_period)) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { KALDI_WARN << "Length difference between feats " << feats.NumRows() - << " and iVectors " << ivector_feats->NumRows() + << " and iVectors " << online_ivector_feats->NumRows() << "exceeds tolerance " << length_tolerance; num_err++; continue; } - ProcessFile(feats, ivector_feats, pdf_post, key, compress, - num_pdfs, left_context, right_context, num_frames, + ProcessFile(feats, online_ivector_feats, online_ivector_period, + pdf_post, key, compress, num_pdfs, utt_splitter, &num_frames_written, &num_egs_written, &example_writer); num_done++; From 8573207c84cd5da8f5ca5852eeec7c320b3189ff Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Mon, 26 Dec 2016 23:06:41 -0800 Subject: [PATCH 031/213] Some partial work towards getting the new egs-merging process working (currently only for regular nnet3 egs) --- src/nnet3/nnet-common.cc | 30 +++ src/nnet3/nnet-common.h | 7 +- src/nnet3/nnet-computation.cc | 10 +- src/nnet3/nnet-computation.h | 10 +- src/nnet3/nnet-example-utils.cc | 357 ++++++++++++++++++++++++++++++++ src/nnet3/nnet-example-utils.h | 193 +++++++++++++++++ src/nnet3/nnet-example.cc | 48 +++++ src/nnet3/nnet-example.h | 48 +++++ src/nnet3/nnet-optimize.cc | 51 ++--- src/nnet3/nnet-optimize.h | 11 +- src/nnet3bin/nnet3-merge-egs.cc | 9 +- src/util/stl-utils.h | 12 +- src/util/timer.h | 28 --- 13 files changed, 730 insertions(+), 84 deletions(-) delete mode 100644 src/util/timer.h diff --git a/src/nnet3/nnet-common.cc b/src/nnet3/nnet-common.cc index 412fc71341a..906217c3561 100644 --- a/src/nnet3/nnet-common.cc +++ b/src/nnet3/nnet-common.cc @@ -370,6 +370,36 @@ size_t CindexVectorHasher::operator () ( return ans; } +size_t IndexVectorHasher::operator () ( + const std::vector &index_vector) const { + size_t n1 = 15, n2 = 10; // n1 and n2 are used to extract only a subset of + // elements to hash; this makes the hasher faster by + // skipping over more elements. Setting n1 large or + // n2 to 1 would make the hasher consider all + // elements. + // all long-ish numbers appearing below are randomly chosen primes. + size_t ans = 1433 + 34949 * index_vector.size(); + std::vector::const_iterator iter = index_vector.begin(), + end = index_vector.end(), med = end; + if (med > iter + n1) + med = iter + n1; + + for (; iter != med; ++iter) { + ans += iter->n * 1619; + ans += iter->t * 15649; + ans += iter->x * 89809; + } + // after the first n1 values, look only at every n2'th value. this makes the + // hashing much faster, and in the kinds of structures that we actually deal + // with, we shouldn't get unnecessary hash collisions as a result of this + // optimization. 
+ for (; iter < end; iter += n2) { + ans += iter->n * 1619; + ans += iter->t * 15649; + ans += iter->x * 89809; + } + return ans; +} std::ostream &operator << (std::ostream &ostream, const Index &index) { return ostream << '(' << index.n << ' ' << index.t << ' ' << index.x << ')'; diff --git a/src/nnet3/nnet-common.h b/src/nnet3/nnet-common.h index f76166c0758..cb5d8c3b944 100644 --- a/src/nnet3/nnet-common.h +++ b/src/nnet3/nnet-common.h @@ -107,11 +107,16 @@ struct CindexHasher { size_t operator () (const Cindex &cindex) const; }; - struct CindexVectorHasher { size_t operator () (const std::vector &cindex_vector) const; }; +// Note: because IndexVectorHasher is used in some things where we really need +// it to be fast, it doesn't look at all the indexes, just most of them. +struct IndexVectorHasher { + size_t operator () (const std::vector &index_vector) const; +}; + // this will only be used for pretty-printing. diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index bb3aaddc829..5be1b7def94 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -1139,7 +1139,15 @@ void NnetComputation::GetWholeSubmatrices( } } - +size_t IoSpecificationHasher::operator () ( + const IoSpecification &io_spec) const { + StringHasher string_hasher; + IndexVectorHasher indexes_hasher; + // 4261 was chosen at random from a list of primes. + return string_hasher(io_spec.name) + + indexes_hasher(io_spec.indexes) + + (io_spec.has_deriv ? 4261 : 0); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index fd8cb06d06b..c7972da2102 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -65,8 +65,10 @@ struct MiscComputationInfo { // produce. For inputs, the name should correspond to an input or component // node name in the nnet (components are allowed so context can be provided in // recurrent setups); for outputs, the name should be an output node name in the -// Nnet. In the normal case there will just be one input and one output, and -// the indexes will vary only in the t index, with the others all identical. +// Nnet. +// note: this structure is used to represent egs both before and after merging +// into minibatches; if this merging has been done, the indexes will vary in +// the 'n' dimension. struct IoSpecification { std::string name; std::vector indexes; @@ -97,6 +99,10 @@ struct IoSpecification { bool operator== (const IoSpecification &other) const; }; +struct IoSpecificationHasher { + size_t operator () (const IoSpecification &io_spec) const; +}; + // struct ComputationRequest is whatever we need in addition to the // network itself in order to create the structure of a computation. 
The most diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index dc9dedefe43..e88eff71e77 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -23,6 +23,7 @@ #include "hmm/posterior.h" #include "util/text-utils.h" #include +#include namespace kaldi { namespace nnet3 { @@ -814,6 +815,362 @@ void UtteranceSplitter::SetOutputWeights( } } +// static +bool ExampleMergingConfig::ParseIntSet(const std::string &str, + ExampleMergingConfig::IntSet *int_set) { + std::vector split_str; + SplitStringToVector(str, ",", false, &split_str); + if (split_str.empty()) + return false; + int_set->largest_size = 0; + int_set->ranges.resize(split_str.size()); + for (size_t i = 0; i < split_str.size(); i++) { + std::vector split_range; + // note: because we split on '-', it't not possible to + // get negative values in 'split_range'. + SplitStringToIntegers(str, "-", false, &split_range); + if (split_range.size() < 1 || split_range.size() > 2 || + split_range[0] > split_range[1]) + return false; + int_set->ranges[i].first = split_range[0]; + int_set->ranges[i].second = split_range.back(); + int_set->largest_size = std::max(int_set->largest_size, + split_range.back()); + } + return true; +} + +void ExampleMergingConfig::ComputeDerived() { + if (measure_output_frames != "deprecated") { + KALDI_WARN << "The --measure-output-frames option is deprecated " + "and will be ignored."; + } + if (discard_partial_minibatches != "deprecated") { + KALDI_WARN << "The --discard-partial-minibatches option is deprecated " + "and will be ignored."; + } + std::vector minibatch_size_split; + SplitStringToVector(minibatch_size, "/", false, &minibatch_size_split); + if (minibatch_size_split.empty()) { + KALDI_ERR << "Invalid option --minibatch-size=" << minibatch_size; + } + + rules.resize(minibatch_size_split.size()); + for (size_t i = 0; i < minibatch_size_split.size(); i++) { + int32 &minibatch_size = rules[i].first; + IntSet &int_set = rules[i].second; + // 'this_rule' will be either something like "256" or like "64-128,256" + // (but these two only if minibatch_size_split.size() == 1, or something with + // an example-size specified, like "256=64-128,256" + std::string &this_rule = minibatch_size_split[i]; + if (this_rule.find('=') != std::string::npos) { + std::vector rule_split; // split on '=' + SplitStringToVector(this_rule, "=", false, &rule_split); + if (rule_split.size() != 2) { + KALDI_ERR << "Could not parse option --minibatch-size=" + << minibatch_size; + } + if (!ConvertStringToInteger(rule_split[0], &minibatch_size) || + !ParseIntSet(rule_split[1], &int_set)) + KALDI_ERR << "Could not parse option --minibatch-size=" + << minibatch_size; + + } else { + if (minibatch_size_split.size() != 1) { + KALDI_ERR << "Could not parse option --minibatch-size=" + << minibatch_size << " (all rules must have " + << "minibatch-size specified if >1 rule)"; + } + minibatch_size = 0; + if (!ParseIntSet(this_rule, &int_set)) + KALDI_ERR << "Could not parse option --minibatch-size=" + << minibatch_size; + } + } + { + // check that no size is repeated. 
+ std::vector all_sizes(minibatch_size_split.size()); + for (size_t i = 0; i < minibatch_size_split.size(); i++) + all_sizes[i] = rules[i].first; + std::sort(all_sizes.begin(), all_sizes.end()); + if (!IsSortedAndUniq(all_sizes)) { + KALDI_ERR << "Invalid --minibatch-size=" << minibatch_size + << " (repeated example-sizes)"; + } + } +} + +int32 ExampleMergingConfig::MinibatchSize(int32 size_of_eg, + int32 num_available_egs, + bool input_ended) const { + KALDI_ASSERT(num_available_egs > 0 && size_of_eg > 0); + int32 num_rules = rules.size(); + if (num_rules == 0) + KALDI_ERR << "You need to call ComputeDerived() before calling " + "MinibatchSize()."; + int32 min_distance = std::numeric_limits::max(), + closest_rule_index = 0; + for (int32 i = 0; i < num_rules; i++) { + int32 distance = std::abs(size_of_eg - rules[i].first); + if (distance < min_distance) { + min_distance = distance; + closest_rule_index = i; + } + } + if (!input_ended) { + // until the input ends, we can only use the largest available + // minibatch-size (otherwise, we could expect more later). + int32 largest_size = rules[closest_rule_index].second.largest_size; + if (largest_size <= num_available_egs) + return largest_size; + else + return 0; + } else { + int32 s = rules[closest_rule_index].second.LargestValueInRange( + num_available_egs); + KALDI_ASSERT(s <= num_available_egs); + return s; + } +} + + +void ExampleSizeStats::WroteExample(int32 example_size, + size_t structure_hash, + int32 minibatch_size) { + std::pair p(example_size, structure_hash); + + + unordered_map &h = stats_[p].minibatch_to_num_written; + unordered_map::iterator iter = h.find(minibatch_size); + if (iter == h.end()) + h[minibatch_size] = 1; + else + iter->second += 1; +} + +void ExampleSizeStats::DiscardedExamples(int32 example_size, + size_t structure_hash, + int32 num_discarded) { + std::pair p(example_size, structure_hash); + stats_[p].num_discarded += num_discarded; +} + + +void ExampleSizeStats::PrintStats() const { + PrintAggregateStats(); + PrintSpecificStats(); +} + +void ExampleSizeStats::PrintAggregateStats() const { + // First print some aggregate stats. + int64 num_distinct_egs_types = 0, // number of distinct types of input egs + // (differing in size or structure). + total_discarded_egs = 0, // total number of discarded egs. + total_discarded_egs_size = 0, // total number of discarded egs each multiplied by size + // of that eg + total_non_discarded_egs = 0, // total over all minibatches written, of + // minibatch-size, equals number of input egs + // that were not discarded. + total_non_discarded_egs_size = 0, // total over all minibatches of size-of-eg + // * minibatch-size. + num_minibatches = 0, // total number of minibatches + num_distinct_minibatch_types = 0; // total number of combination of + // (type-of-eg, number of distinct + // minibatch-sizes for that eg-type)- + // reflects the number of time we have + // to compile. 
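// A minimal sketch (illustrative only, simplified types) of the policy that
// ExampleMergingConfig::MinibatchSize() above implements: while input is still
// arriving, only the largest configured minibatch size is used, so full-sized
// minibatches are preferred; once the input has ended, the largest configured
// value that still fits the remaining examples is used (what
// IntSet::LargestValueInRange(num_available) would return).
static int MinibatchSizeSketch(int largest_configured, int largest_fitting,
                               int num_available, bool input_ended) {
  if (!input_ended)  // keep waiting until a full-sized minibatch can be formed
    return (largest_configured <= num_available ? largest_configured : 0);
  return largest_fitting;
}
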
+ + StatsType::const_iterator eg_iter = stats_.begin(), eg_end = stats_.end(); + + for (; eg_iter != eg_end; ++eg_iter) { + int32 eg_size = eg_iter->first.first; + const StatsForExampleSize &stats = eg_iter->second; + num_distinct_egs_types++; + total_discarded_egs += stats.num_discarded; + total_discarded_egs_size += stats.num_discarded * eg_size; + + unordered_map::const_iterator + mb_iter = stats.minibatch_to_num_written.begin(), + mb_end = stats.minibatch_to_num_written.end(); + for (; mb_iter != mb_end; ++mb_iter) { + int32 mb_size = mb_iter->first, + num_written = mb_iter->second; + num_distinct_minibatch_types++; + num_minibatches += num_written; + total_non_discarded_egs += num_written * mb_size; + total_non_discarded_egs_size += num_written * mb_size * eg_size; + } + } + // the averages are written as integers- we don't really need more precision + // than that. + int64 total_input_egs = total_discarded_egs + total_non_discarded_egs, + total_input_egs_size = + total_discarded_egs_size + total_non_discarded_egs_size; + + float avg_input_egs_size = total_input_egs_size * 1.0 / total_input_egs; + float percent_discarded = total_discarded_egs * 100.0 / total_input_egs; + // note: by minibatch size we mean the number of egs per minibatch, it + // does not take note of the size of the input egs. + float avg_minibatch_size = total_non_discarded_egs * 1.0 / num_minibatches; + + std::ostringstream os; + os << std::setprecision(4); + os << "Processed " << total_input_egs + << " egs of avg. size " << avg_input_egs_size + << " into " << num_minibatches << " minibatches, discarding " + << percent_discarded << "% of egs. Avg minibatch size was " + << avg_minibatch_size << ", distinct types of egs/minibatches " + << "was " << num_distinct_egs_types << "/" + << num_distinct_minibatch_types; + KALDI_LOG << os.str(); +} + +void ExampleSizeStats::PrintSpecificStats() const { + KALDI_LOG << "Merged specific eg types as follows [format: =" + "{->,->.../d=}" + ",={...},... (note,egs-size == number of input " + "frames including context)."; + std::ostringstream os; + + // copy from unordered map to map to get sorting, for consistent output. 
+ typedef std::map, StatsForExampleSize> SortedMapType; + + SortedMapType stats; + stats.insert(stats_.begin(), stats_.end()); + SortedMapType::const_iterator eg_iter = stats.begin(), eg_end = stats.end(); + for (; eg_iter != eg_end; ++eg_iter) { + int32 eg_size = eg_iter->first.first; + if (eg_iter != stats.begin()) + os << ","; + os << eg_size << "={"; + const StatsForExampleSize &stats = eg_iter->second; + unordered_map::const_iterator + mb_iter = stats.minibatch_to_num_written.begin(), + mb_end = stats.minibatch_to_num_written.end(); + for (; mb_iter != mb_end; ++mb_iter) { + int32 mb_size = mb_iter->first, + num_written = mb_iter->second; + if (mb_iter != stats.minibatch_to_num_written.begin()) + os << ","; + os << mb_size << "->" << num_written; + } + os << ",d=" << stats.num_discarded << "}"; + } + KALDI_LOG << os.str(); +} + + + +int32 GetNnetExampleSize(const NnetExample &a) { + int32 ans = 0; + for (size_t i = 0; i < a.io.size(); i++) { + int32 s = a.io[i].indexes.size(); + if (s > ans) + ans = s; + } + return ans; +} + +ExampleMerger::ExampleMerger(const ExampleMergingConfig &config, + NnetExampleWriter *writer): + finished_(false), num_egs_written_(0), + config_(config), writer_(writer) { } + + +void ExampleMerger::AcceptExample(NnetExample *eg) { + KALDI_ASSERT(!finished_); + // If an eg with the same structure as 'eg' is already a key in the + // map, it won't be replaced, but if it's new it will be made + // the key. Also we remove the key before making the vector empty. + // This way we ensure that the eg in the key is always the first + // element of the vector. + std::vector &vec = eg_to_egs_[eg]; + vec.push_back(eg); + int32 eg_size = GetNnetExampleSize(*eg), + num_available = vec.size(); + bool input_ended = false; + int32 minibatch_size = config_.MinibatchSize(eg_size, num_available, + input_ended); + if (minibatch_size != 0) { // we need to write out a merged eg. + KALDI_ASSERT(minibatch_size == num_available); + + std::vector vec_copy(vec); + eg_to_egs_.erase(eg); + + // MergeExamples() expects a vector of NnetExample, not of pointers, + // so use swap to create that without doing any real work. + std::vector egs_to_merge(minibatch_size); + for (int32 i = 0; i < minibatch_size; i++) { + egs_to_merge[i].Swap(vec[i]); + delete vec[i]; // we owned those pointers. + } + WriteMinibatch(egs_to_merge); + } +} + +void ExampleMerger::WriteMinibatch(const std::vector &egs) { + KALDI_ASSERT(!egs.empty()); + int32 eg_size = GetNnetExampleSize(egs[0]); + NnetExampleStructureHasher eg_hasher; + size_t structure_hash = eg_hasher(egs[0]); + int32 minibatch_size = egs.size(); + stats_.WroteExample(eg_size, structure_hash, minibatch_size); + NnetExample merged_eg; + MergeExamples(egs, config_.compress, &merged_eg); + std::ostringstream key; + key << "merged-" << (num_egs_written_++) << "-" << minibatch_size; + writer_->Write(key.str(), merged_eg); +} + +void ExampleMerger::Finish() { + if (finished_) return; // already finished. + finished_ = true; + + // we'll convert the map eg_to_egs_ to a vector of vectors to avoid + // iterator invalidation problems. 
+ std::vector > all_egs; + all_egs.reserve(eg_to_egs_.size()); + + MapType::iterator iter = eg_to_egs_.begin(), end = eg_to_egs_.end(); + for (; iter != end; ++iter) + all_egs.push_back(iter->second); + eg_to_egs_.clear(); + + for (size_t i = 0; i < all_egs.size(); i++) { + int32 minibatch_size; + std::vector &vec = all_egs[i]; + KALDI_ASSERT(!vec.empty()); + int32 eg_size = GetNnetExampleSize(*(vec[0])); + bool input_ended = true; + while (!vec.empty() && + (minibatch_size = config_.MinibatchSize(eg_size, vec.size(), + input_ended)) != 0) { + // MergeExamples() expects a vector of NnetExample, not of pointers, + // so use swap to create that without doing any real work. + std::vector egs_to_merge(minibatch_size); + for (int32 i = 0; i < minibatch_size; i++) { + egs_to_merge[i].Swap(vec[i]); + delete vec[i]; // we owned those pointers. + } + vec.erase(vec.begin(), vec.begin() + minibatch_size); + WriteMinibatch(egs_to_merge); + } + if (!vec.empty()) { + int32 eg_size = GetNnetExampleSize(*(vec[0])); + NnetExampleStructureHasher eg_hasher; + size_t structure_hash = eg_hasher(*(vec[0])); + int32 num_discarded = vec.size(); + stats_.DiscardedExamples(eg_size, structure_hash, num_discarded); + for (int32 i = 0; i < num_discarded; i++) + delete vec[i]; + vec.clear(); + } + } + + + +} + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 754743d581e..75a47772fda 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -23,6 +23,7 @@ #include "nnet3/nnet-example.h" #include "nnet3/nnet-computation.h" #include "nnet3/nnet-compute.h" +#include "util/kaldi-table.h" namespace kaldi { namespace nnet3 { @@ -294,11 +295,203 @@ class UtteranceSplitter { // chunks, and then add the subtracted number of copies of the primary // num-frames to the split. std::vector > > splits_for_length_; +}; + + +class ExampleMergingConfig { +public: + // The following configuration values are registered on the command line. + bool compress; + std::string measure_output_frames; // for back-compatibility, not used. + std::string minibatch_size; + std::string discard_partial_minibatches; // for back-compatibility, not used. + + ExampleMergingConfig(): compress(false), + measure_output_frames("deprecated"), + minibatch_size("256"), + discard_partial_minibatches("deprecated") { } + + void Register(OptionsItf *po) { + po->Register("compress", &compress, "If true, compress the output examples " + "(not recommended unless you are writing to disk)"); + po->Register("measure-output-frames", &measure_output_frames, "This " + "value will be ignored (included for back-compatibility)"); + po->Register("discard-partial-minibatches", &discard_partial_minibatches, + "This value will be ignored (included for back-compatibility)"); + po->Register("minibatch-size", &minibatch_size, + "String controlling the minibatch size. May be just an integer, " + "meaning a fixed minibatch size (e.g. --minibatch-size=128). " + "May be a list of ranges and values, e.g. --minibatch-size=32,64 " + "or --minibatch-size=16-32,64,128. All minibatches will be of " + "the largest size until the end of the input is reached; " + "then, increasingly smaller sizes will be allowed. Only egs " + "with the same structure (e.g num-frames) are merged. You may " + "specify different minibatch sizes for different sizes of eg " + "(defined as the maximum number of Indexes on any input), in " + "the format " + "--minibatch-size='eg_size1=mb_sizes1/eg_size2=mb_sizes2', e.g. 
" + "--minibatch-size=128=64-128,256/256=32-64,128. Egs are given " + "minibatch-sizes based on the specified eg-size closest to " + "their actual size."); + } + + + // this function computes the derived (private) parameters; it must be called + // after the command-line parameters are read and before MinibatchSize() is + // called. + void ComputeDerived(); + + /// This function tells you what minibatch size should be used for this eg. + + /// @param [in] size_of_eg The "size" of the eg, as obtained by + /// GetNnetExampleSize() or a similar function (up + /// to the caller). + /// @param [in] num_available_egs The number of egs of this size that are + /// currently available; should be >0. The + /// value returned will be <= this value, possibly + /// zero. + /// @param [in] input_ended True if the input has ended, false otherwise. + /// This is important because before the input has + /// ended, we will only batch egs into the largest + /// possible minibatch size among the range allowed + /// for that size of eg. + /// @return Returns the minibatch size to use in this + /// situation, as specified by the configuration. + int32 MinibatchSize(int32 size_of_eg, + int32 num_available_egs, + bool input_ended) const; + private: + // struct IntSet is a representation of something like 16-32,64, which is a + // nonempty list of either nonnegative integers or ranges of nonnegative + // integers. Conceptually it represents a set of nonnegative integers. + struct IntSet { + // largest_size is the largest integer in any of the ranges (64 in this + // example). + int32 largest_size; + // e.g. would contain ((16,32), (64,64)) in this example. + std::vector > ranges; + // Returns the largest value in any range (i.e. in the set of + // integers that this struct represents), that is <= max_value, + // or 0 if there is no value in any range that is <= max_value. + // In this example, this function would return the following: + // 128->64, 64->64, 63->32, 31->31, 16->16, 15->0, 0->0 + int32 LargestValueInRange(int32 max_value) const; + }; + static bool ParseIntSet(const std::string &str, IntSet *int_set); + + // 'rules' is derived from the configuration values above by ComputeDerived(), + // and are not set directly on the command line. 'rules' is a list of pairs + // (eg-size, int-set-of-minibatch-sizes); If no explicit eg-sizes were + // specified on the command line (i.e. there was no '=' sign in the + // --minibatch-size option), then we just set the int32 to 0. + std::vector > rules; }; +/// This function returns the 'size' of a nnet-example as defined for purposes +/// of merging egs, which is defined as the largest number of Indexes in any of +/// the inputs or outputs of the example. +int32 GetNnetExampleSize(const NnetExample &a); + + + + + +/// This class is responsible for storing, and displaying in log messages, +/// statistics about how examples of different sizes (c.f. GetNnetExampleSize()) +/// were merged into minibatches, and how many examples were left over and +/// discarded. +class ExampleSizeStats { + public: + + /// Users call this function to inform this class that one minibatch has been + /// written aggregating 'minibatch_size' separate examples of original size + /// 'example_size' (e.g. as determined by GetNnetExampleSize(), but the caller + /// does that. + /// The 'structure_hash' is provided so that this class can distinguish + /// between egs that have the same size but different structure. 
In the + /// extremely unlikely eventuality that there is a hash collision, it will + /// cause misleading stats to be printed out. + void WroteExample(int32 example_size, size_t structure_hash, + int32 minibatch_size); + + /// Users call this function to inform this class that after processing all + /// the data, for examples of original size 'example_size', 'num_discarded' + /// examples could not be put into a minibatch and were discarded. + void DiscardedExamples(int32 example_size, size_t structure_hash, + int32 num_discarded); + + /// Calling this will cause a log message with information about the + /// examples to be printed. + void PrintStats() const; + + private: + // this struct stores the stats for examples of a particular size and + // structure. + struct StatsForExampleSize { + int32 num_discarded; + // maps from minibatch-size (i.e. number of egs that were + // aggregated into that minibatch), to the number of such + // minibatches written. + unordered_map minibatch_to_num_written; + StatsForExampleSize(): num_discarded(0) { } + }; + + + typedef unordered_map, StatsForExampleSize, + PairHasher > StatsType; + + // this maps from a pair (example_size, structure_hash) to to the stats for + // examples with those characteristics. + StatsType stats_; + + void PrintAggregateStats() const; + void PrintSpecificStats() const; + +}; + + +/// This class is responsible for arranging examples in groups +/// that have the same strucure (i.e. the same input and output +/// indexes), and outputting them in suitable minibatches +/// as defined by ExampleMergingConfig. +class ExampleMerger { + ExampleMerger(const ExampleMergingConfig &config, + NnetExampleWriter *writer); + + // This function accepts an example, and if possible, writes a merged example + // out. The ownership of the pointer 'a' is transferred to this class when + // you call this function. + void AcceptExample(NnetExample *a); + + // This function announces to the class that the input has finished, so it + // should flush out any smaller-sizes minibatches, as dictated by the config. + // This will be called in the destructor, but you can call it explicitly when + // all the input is done if you want to. + // It also prints the stats. + void Finish(); + + ~ExampleMerger() { Finish(); }; + private: + // called by Finish() and AcceptExample(). Merges, updates the + // stats, and writes. + void WriteMinibatch(const std::vector &egs); + + bool finished_; + int32 num_egs_written_; + const ExampleMergingConfig &config_; + NnetExampleWriter *writer_; + ExampleSizeStats stats_; + + // Note: the "key" into the egs is the first element of the vector. + typedef unordered_map, + NnetExampleStructureHasher, + NnetExampleStructureCompare> MapType; + MapType eg_to_egs_; +}; + } // namespace nnet3 diff --git a/src/nnet3/nnet-example.cc b/src/nnet3/nnet-example.cc index 9a34258e0ee..3e87ebba3f5 100644 --- a/src/nnet3/nnet-example.cc +++ b/src/nnet3/nnet-example.cc @@ -122,5 +122,53 @@ void NnetExample::Compress() { iter->features.Compress(); } + +size_t NnetIoStructureHasher::operator () ( + const NnetIo &io) const { + StringHasher string_hasher; + IndexVectorHasher indexes_hasher; + + // numbers appearing here were taken at random from a list of primes. 
+ size_t ans = string_hasher(io.name) + + indexes_hasher(io.indexes) + + 19249 * io.features.NumRows() + + 14731 * io.features.NumCols(); + return ans; +} + + +bool NnetIoStructureCompare::operator () ( + const NnetIo &a, const NnetIo &b) const { + return a.name == b.name && + a.features.NumRows() == b.features.NumRows() && + a.features.NumCols() == b.features.NumCols() && + a.indexes == b.indexes; +} + + +size_t NnetExampleStructureHasher::operator () ( + const NnetExample &eg) const { + // these numbers were chosen at random from a list of primes. + NnetIoStructureHasher io_hasher; + size_t size = eg.io.size(), ans = size * 35099; + for (size_t i = 0; i < size; i++) + ans = ans * 19157 + io_hasher(eg.io[i]); + return ans; +} + +bool NnetExampleStructureCompare::operator () (const NnetExample &a, + const NnetExample &b) const { + NnetIoStructureCompare io_compare; + if (a.io.size() != b.io.size()) + return false; + size_t size = a.io.size(); + for (size_t i = 0; i < size; i++) + if (!io_compare(a.io[i], b.io[i])) + return false; + return true; +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-example.h b/src/nnet3/nnet-example.h index 1df7cd1e78e..f08754a2bd3 100644 --- a/src/nnet3/nnet-example.h +++ b/src/nnet3/nnet-example.h @@ -75,6 +75,22 @@ struct NnetIo { }; +/// This hashing object hashes just the structural aspects of the NnetIo object +/// (name, indexes, feature dimension) without looking at the value of features. +/// It will be used in combining egs into batches of all similar structure. +struct NnetIoStructureHasher { + size_t operator () (const NnetIo &a) const; +}; +/// This comparison object compares just the structural aspects of the NnetIo +/// object (name, indexes, feature dimension) without looking at the value of +/// features. It will be used in combining egs into batches of all similar +/// structure. +struct NnetIoStructureCompare { + bool operator () (const NnetIo &a, + const NnetIo &b) const; +}; + + /// NnetExample is the input data and corresponding label (or labels) for one or /// more frames of input, used for standard cross-entropy training of neural @@ -104,6 +120,38 @@ struct NnetExample { }; +/// This hashing object hashes just the structural aspects of the NnetExample +/// without looking at the value of the features. It will be used in combining +/// egs into batches of all similar structure. Note: the hash value is +/// sensitive to the order in which the NnetIo elements (input and outputs) +/// appear, even though the merging is capable of dealing with +/// differently-ordered inputs and outputs (e.g. "input" appearing before +/// vs. after "ivector" or "output"). We don't think anyone would ever have to +/// deal with differently-ordered, but otherwise identical, egs in practice so +/// we don't bother making the hashing function independent of this order. +struct NnetExampleStructureHasher { + size_t operator () (const NnetExample &eg) const; + // We also provide a version of this that works from pointers. + size_t operator () (const NnetExample *eg) const { return (*this)(*eg); } +}; + + +/// This comparator object compares just the structural aspects of the +/// NnetExample without looking at the value of the features. Like +/// NnetExampleStructureHasher, it is sensitive to the order in which the +/// differently-named NnetIo elements appear. This hashing object will be used +/// in combining egs into batches of all similar structure. 
+struct NnetExampleStructureCompare { + bool operator () (const NnetExample &a, + const NnetExample &b) const; + // We also provide a version of this that works from pointers. + bool operator () (const NnetExample *a, + const NnetExample *b) const { return (*this)(*a, *b); } + +}; + + + typedef TableWriter > NnetExampleWriter; typedef SequentialTableReader > SequentialNnetExampleReader; typedef RandomAccessTableReader > RandomAccessNnetExampleReader; diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 6da7699cb93..54ebf17edc7 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -20,6 +20,7 @@ #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-optimize-utils.h" +#include "base/timer.h" namespace kaldi { namespace nnet3 { @@ -532,45 +533,16 @@ void Optimize(const NnetOptimizeOptions &config, // of inputs and outputs size_t ComputationRequestHasher::operator() (const ComputationRequest *cr) const { size_t ans = 0; + size_t p1 = 4111, p2 = 26951; + IoSpecificationHasher io_hasher; std::vector::const_iterator itr = cr->inputs.begin(), end = cr->inputs.end(); - for (; itr != end; ++itr) { - ans += IoSpecificationToInt(*itr); - } + for (; itr != end; ++itr) + ans = ans * p1 + io_hasher(*itr); itr = cr->outputs.begin(); end = cr->outputs.end(); - for (; itr != end; ++itr) { - ans += IoSpecificationToInt(*itr); - } - return ans; -} - -size_t ComputationRequestHasher::IoSpecificationToInt(const IoSpecification& spec) const { - size_t ans; - size_t n = 19; // this value is used to extract only a subset of elements to hash; - // it makes the hasher faster. - StringHasher string_hasher; - ans = string_hasher(spec.name); - std::vector::const_iterator iter = spec.indexes.begin(), - end = spec.indexes.end(), - med = end; - if (med > iter + n) - med = iter + n; - - for (; iter != med; ++iter) { - ans += iter->n * 1619; - ans += iter->t * 15649; - ans += iter->x * 89809; - } - // after the first 'n' values, look only at every n'th value. this makes the - // hashing much faster, and in the kinds of structures that we actually deal - // with, we shouldn't get unnecessary hash collisions as a result of this - // optimization. - for (; iter < end; iter += n) { - ans += iter->n * 1619; - ans += iter->t * 15649; - ans += iter->x * 89809; - } + for (; itr != end; ++itr) + ans = ans * p2 + io_hasher(*itr); return ans; } @@ -643,20 +615,25 @@ CachingOptimizingCompiler::~CachingOptimizingCompiler() { delete itr->first; delete itr->second.first; } + KALDI_LOG << seconds_taken_ << " seconds taken in nnet3 compilation"; } const NnetComputation* CachingOptimizingCompiler::Compile( const ComputationRequest &in_request) { + Timer timer; + const NnetComputation *ans; // find computation in the cache CacheType::iterator cit = computation_cache_.find(&in_request); if (cit == computation_cache_.end()) { - return CompileAndCache(in_request); + ans = CompileAndCache(in_request); } else { // if found, update access queue const NnetComputation *computation = cit->second.first; UpdateAccessQueue(cit); - return computation; + ans = computation; } + seconds_taken_ += timer.Elapsed(); + return ans; } const NnetComputation* CachingOptimizingCompiler::CompileAndCache( diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 1ca776d4ee6..ab0721e802a 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -164,9 +164,6 @@ void Optimize(const NnetOptimizeOptions &config, // and output IoSpecifications vectors. 
struct ComputationRequestHasher { size_t operator()(const ComputationRequest *cr) const; - private: - size_t IoSpecificationToInt(const IoSpecification& spec) const; - static const int kPrime = 7853; }; // Equality function for ComputationRequest pointer @@ -210,14 +207,15 @@ class CachingOptimizingCompiler { CachingOptimizingCompiler(const Nnet &nnet, const CachingOptimizingCompilerOptions config = CachingOptimizingCompilerOptions()): - nnet_(nnet), config_(config) { } + nnet_(nnet), config_(config), seconds_taken_(0.0) { } /// Note: nnet is retained as a const reference but opt_config is copied. CachingOptimizingCompiler(const Nnet &nnet, const NnetOptimizeOptions &opt_config, const CachingOptimizingCompilerOptions config = CachingOptimizingCompilerOptions()): - nnet_(nnet), config_(config), opt_config_(opt_config) { } + nnet_(nnet), config_(config), opt_config_(opt_config), + seconds_taken_(0.0) { } ~CachingOptimizingCompiler(); /// Does the compilation and returns a const pointer to @@ -276,6 +274,9 @@ class CachingOptimizingCompiler { ComputationRequestPtrEqual> CacheType; CacheType computation_cache_; + // time spent in compilation-- for diagnostic messages + double seconds_taken_; + // This function updates the computation cache. It is called within Compile(). // It takes ownership of the pointers. It inserts the request at the end of // the queue, and purges the least-recently-accessed request from the queue and diff --git a/src/nnet3bin/nnet3-merge-egs.cc b/src/nnet3bin/nnet3-merge-egs.cc index 0e40de8aeae..48ba2986512 100644 --- a/src/nnet3bin/nnet3-merge-egs.cc +++ b/src/nnet3bin/nnet3-merge-egs.cc @@ -74,9 +74,8 @@ int main(int argc, char *argv[]) { po.Register("compress", &compress, "If true, compress the output examples " "(not recommended unless you are writing to disk)"); po.Register("discard-partial-minibatches", &discard_partial_minibatches, - "discard any partial minibatches of 'uneven' size that may be " - "encountered at the end; 'true' is recommended, to avoid " - "incurring compilation costs."); + "discard any partial minibatches of 'uneven' size that may be " + "encountered at the end."); po.Read(argc, argv); @@ -112,7 +111,7 @@ int main(int argc, char *argv[]) { num_read++; if (minibatch_ready || (!discard_partial_minibatches && - (example_reader.Done() && !examples.empty()))) { + (example_reader.Done() && !examples.empty()))) { NnetExample merged_eg; MergeExamples(examples, compress, &merged_eg); std::ostringstream ostr; @@ -131,3 +130,5 @@ int main(int argc, char *argv[]) { return -1; } } + + diff --git a/src/util/stl-utils.h b/src/util/stl-utils.h index d37e4d2d203..b5f8f246d95 100644 --- a/src/util/stl-utils.h +++ b/src/util/stl-utils.h @@ -245,16 +245,16 @@ struct VectorHasher { // hashing function for vector. }; /// A hashing function-object for pairs of ints -template +template struct PairHasher { // hashing function for pair - size_t operator()(const std::pair &x) const { - return x.first + x.second * kPrime; + size_t operator()(const std::pair &x) const { + // 7853 was chosen at random from a list of primes. + return x.first + x.second * 7853; } PairHasher() { // Check we're instantiated with an integer type. 
- KALDI_ASSERT_IS_INTEGER_TYPE(Int); + KALDI_ASSERT_IS_INTEGER_TYPE(Int1); + KALDI_ASSERT_IS_INTEGER_TYPE(Int2); } - private: - static const int kPrime = 7853; }; diff --git a/src/util/timer.h b/src/util/timer.h deleted file mode 100644 index 3b92b48b603..00000000000 --- a/src/util/timer.h +++ /dev/null @@ -1,28 +0,0 @@ -// util/timer.h - -// Copyright 2014 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// We are temporarily leaving this file to forward #includes to -// base-timer.h. Its use is deprecated; you should directrly -// #include base/timer.h -#ifndef KALDI_UTIL_TIMER_H_ -#define KALDI_UTIL_TIMER_H_ -#pragma message warning: please do not include util/timer.h, \ - include base/timer.h(it has been moved) -#include "base/timer.h" -#endif // KALDI_UTIL_TIMER_H_ From fe1ccaa55359f94c0f99cc2e3e66a07dd1b3c953 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Wed, 28 Dec 2016 13:08:09 -0800 Subject: [PATCH 032/213] Finish upgrades to eg-merging code for chain and discriminative examples. Compiles but not tested. --- src/chainbin/nnet3-chain-merge-egs.cc | 51 ++---- src/nnet3/nnet-chain-example.cc | 161 ++++++++++++++++- src/nnet3/nnet-chain-example.h | 80 +++++++++ src/nnet3/nnet-discriminative-example.cc | 166 +++++++++++++++++- src/nnet3/nnet-discriminative-example.h | 84 ++++++++- src/nnet3/nnet-example-utils.cc | 20 ++- src/nnet3/nnet-example-utils.h | 4 + .../nnet3-discriminative-merge-egs.cc | 43 ++--- src/nnet3bin/nnet3-merge-egs.cc | 57 ++---- src/util/kaldi-holder-inl.h | 1 + 10 files changed, 534 insertions(+), 133 deletions(-) diff --git a/src/chainbin/nnet3-chain-merge-egs.cc b/src/chainbin/nnet3-chain-merge-egs.cc index 249b5cec0c0..9c91f997e7a 100644 --- a/src/chainbin/nnet3-chain-merge-egs.cc +++ b/src/chainbin/nnet3-chain-merge-egs.cc @@ -41,19 +41,12 @@ int main(int argc, char *argv[]) { "nnet3-chain-merge-egs --minibatch-size=128 ark:1.cegs ark:- | nnet3-chain-train-simple ... \n" "See also nnet3-chain-copy-egs\n"; - bool compress = false; - int32 minibatch_size = 64; - bool discard_partial_minibatches = true; - + ExampleMergingConfig merging_config; + merging_config.minibatch_size = 64; // change the default for this + // program.. anyway it will usually be + // set on the command line. 
ParseOptions po(usage); - po.Register("minibatch-size", &minibatch_size, "Target size of minibatches " - "when merging (see also --measure-output-frames)"); - po.Register("compress", &compress, "If true, compress the output examples " - "(not recommended unless you are writing to disk"); - po.Register("discard-partial-minibatches", &discard_partial_minibatches, - "discard any partial minibatches of 'uneven' size that may be " - "encountered at the end; 'true' is recommended, to avoid " - "incurring compilation costs."); + merging_config.Register(&po); po.Read(argc, argv); @@ -68,38 +61,18 @@ int main(int argc, char *argv[]) { SequentialNnetChainExampleReader example_reader(examples_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); - std::vector examples; - examples.reserve(minibatch_size); - - int64 num_read = 0, num_written = 0; + ChainExampleMerger merger(merging_config, &example_writer); while (!example_reader.Done()) { const NnetChainExample &cur_eg = example_reader.Value(); - examples.resize(examples.size() + 1); - examples.back() = cur_eg; - - bool minibatch_ready = - static_cast(examples.size()) >= minibatch_size; - - // Do Next() now, so we can test example_reader.Done() below . - example_reader.Next(); - num_read++; - - if (minibatch_ready || (!discard_partial_minibatches && - (example_reader.Done() && !examples.empty()))) { - NnetChainExample merged_eg; - MergeChainExamples(compress, &examples, &merged_eg); - std::ostringstream ostr; - ostr << "merged-" << num_written; - num_written++; - std::string output_key = ostr.str(); - example_writer.Write(output_key, merged_eg); - examples.clear(); - } + merger.AcceptExample(new NnetChainExample(cur_eg)); } - KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.'; - return (num_written != 0 ? 0 : 1); + // the merger itself prints the necessary diagnostics. + merger.Finish(); + return merger.ExitStatus(); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; } } + + diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 0607543b743..b1c6e60de47 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -207,8 +207,8 @@ static void MergeSupervision( std::vector output_supervision; bool compactify = true; AppendSupervision(input_supervision, - compactify, - &output_supervision); + compactify, + &output_supervision); if (output_supervision.size() != 1) KALDI_ERR << "Failed to merge 'chain' examples-- inconsistent lengths " << "or weights?"; @@ -300,7 +300,7 @@ void TruncateDerivWeights(int32 truncate, deriv_weights.Set(1.0); } int32 num_sequences = supervision.supervision.num_sequences, - frames_per_sequence = supervision.supervision.frames_per_sequence; + frames_per_sequence = supervision.supervision.frames_per_sequence; KALDI_ASSERT(2 * truncate < frames_per_sequence); for (int32 t = 0; t < truncate; t++) for (int32 s = 0; s < num_sequences; s++) @@ -421,5 +421,160 @@ void ShiftChainExampleTimes(int32 frame_shift, } } + +size_t NnetChainExampleStructureHasher::operator () ( + const NnetChainExample &eg) const { + // these numbers were chosen at random from a list of primes. 
+ NnetIoStructureHasher io_hasher; + size_t size = eg.inputs.size(), ans = size * 35099; + for (size_t i = 0; i < size; i++) + ans = ans * 19157 + io_hasher(eg.inputs[i]); + for (size_t i = 0; i < eg.outputs.size(); i++) { + const NnetChainSupervision &sup = eg.outputs[i]; + StringHasher string_hasher; + IndexVectorHasher indexes_hasher; + ans = ans * 17957 + + string_hasher(sup.name) + indexes_hasher(sup.indexes); + } + return ans; +} + +bool NnetChainExampleStructureCompare::operator () ( + const NnetChainExample &a, + const NnetChainExample &b) const { + NnetIoStructureCompare io_compare; + if (a.inputs.size() != b.inputs.size() || + a.outputs.size() != b.outputs.size()) + return false; + size_t size = a.inputs.size(); + for (size_t i = 0; i < size; i++) + if (!io_compare(a.inputs[i], b.inputs[i])) + return false; + size = a.outputs.size(); + for (size_t i = 0; i < size; i++) + if (a.outputs[i].name != b.outputs[i].name || + a.outputs[i].indexes != b.outputs[i].indexes) + return false; + return true; +} + + +int32 GetNnetChainExampleSize(const NnetChainExample &a) { + int32 ans = 0; + for (size_t i = 0; i < a.inputs.size(); i++) { + int32 s = a.inputs[i].indexes.size(); + if (s > ans) + ans = s; + } + for (size_t i = 0; i < a.outputs.size(); i++) { + int32 s = a.outputs[i].indexes.size(); + if (s > ans) + ans = s; + } + return ans; +} + + +ChainExampleMerger::ChainExampleMerger(const ExampleMergingConfig &config, + NnetChainExampleWriter *writer): + finished_(false), num_egs_written_(0), + config_(config), writer_(writer) { } + + +void ChainExampleMerger::AcceptExample(NnetChainExample *eg) { + KALDI_ASSERT(!finished_); + // If an eg with the same structure as 'eg' is already a key in the + // map, it won't be replaced, but if it's new it will be made + // the key. Also we remove the key before making the vector empty. + // This way we ensure that the eg in the key is always the first + // element of the vector. + std::vector &vec = eg_to_egs_[eg]; + vec.push_back(eg); + int32 eg_size = GetNnetChainExampleSize(*eg), + num_available = vec.size(); + bool input_ended = false; + int32 minibatch_size = config_.MinibatchSize(eg_size, num_available, + input_ended); + if (minibatch_size != 0) { // we need to write out a merged eg. + KALDI_ASSERT(minibatch_size == num_available); + + std::vector vec_copy(vec); + eg_to_egs_.erase(eg); + + // MergeChainExamples() expects a vector of NnetChainExample, not of pointers, + // so use swap to create that without doing any real work. + std::vector egs_to_merge(minibatch_size); + for (int32 i = 0; i < minibatch_size; i++) { + egs_to_merge[i].Swap(vec[i]); + delete vec[i]; // we owned those pointers. + } + WriteMinibatch(&egs_to_merge); + } +} + +void ChainExampleMerger::WriteMinibatch( + std::vector *egs) { + KALDI_ASSERT(!egs->empty()); + int32 eg_size = GetNnetChainExampleSize((*egs)[0]); + NnetChainExampleStructureHasher eg_hasher; + size_t structure_hash = eg_hasher((*egs)[0]); + int32 minibatch_size = egs->size(); + stats_.WroteExample(eg_size, structure_hash, minibatch_size); + NnetChainExample merged_eg; + MergeChainExamples(config_.compress, egs, &merged_eg); + std::ostringstream key; + key << "merged-" << (num_egs_written_++) << "-" << minibatch_size; + writer_->Write(key.str(), merged_eg); +} + +void ChainExampleMerger::Finish() { + if (finished_) return; // already finished. + finished_ = true; + + // we'll convert the map eg_to_egs_ to a vector of vectors to avoid + // iterator invalidation problems. 
+ std::vector > all_egs; + all_egs.reserve(eg_to_egs_.size()); + + MapType::iterator iter = eg_to_egs_.begin(), end = eg_to_egs_.end(); + for (; iter != end; ++iter) + all_egs.push_back(iter->second); + eg_to_egs_.clear(); + + for (size_t i = 0; i < all_egs.size(); i++) { + int32 minibatch_size; + std::vector &vec = all_egs[i]; + KALDI_ASSERT(!vec.empty()); + int32 eg_size = GetNnetChainExampleSize(*(vec[0])); + bool input_ended = true; + while (!vec.empty() && + (minibatch_size = config_.MinibatchSize(eg_size, vec.size(), + input_ended)) != 0) { + // MergeChainExamples() expects a vector of + // NnetChainExample, not of pointers, so use swap to create that + // without doing any real work. + std::vector egs_to_merge(minibatch_size); + for (int32 i = 0; i < minibatch_size; i++) { + egs_to_merge[i].Swap(vec[i]); + delete vec[i]; // we owned those pointers. + } + vec.erase(vec.begin(), vec.begin() + minibatch_size); + WriteMinibatch(&egs_to_merge); + } + if (!vec.empty()) { + int32 eg_size = GetNnetChainExampleSize(*(vec[0])); + NnetChainExampleStructureHasher eg_hasher; + size_t structure_hash = eg_hasher(*(vec[0])); + int32 num_discarded = vec.size(); + stats_.DiscardedExamples(eg_size, structure_hash, num_discarded); + for (int32 i = 0; i < num_discarded; i++) + delete vec[i]; + vec.clear(); + } + } +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 9be298074a4..87b2de77897 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -25,6 +25,7 @@ #include "hmm/posterior.h" #include "util/table-types.h" #include "nnet3/nnet-example.h" +#include "nnet3/nnet-example-utils.h" #include "chain/chain-supervision.h" namespace kaldi { @@ -130,6 +131,31 @@ struct NnetChainExample { } }; +/// This hashing object hashes just the structural aspects of the NnetExample +/// without looking at the value of the features. It will be used in combining +/// egs into batches of all similar structure. +struct NnetChainExampleStructureHasher { + size_t operator () (const NnetChainExample &eg) const; + // We also provide a version of this that works from pointers. + size_t operator () (const NnetChainExample *eg) const { + return (*this)(*eg); + } +}; + + +/// This comparator object compares just the structural aspects of the +/// NnetChainExample without looking at the value of the features. +struct NnetChainExampleStructureCompare { + bool operator () (const NnetChainExample &a, + const NnetChainExample &b) const; + // We also provide a version of this that works from pointers. + bool operator () (const NnetChainExample *a, + const NnetChainExample *b) const { + return (*this)(*a, *b); + } +}; + + /// This function merges a list of NnetChainExample objects into a single one-- /// intended to be used when forming minibatches for neural net training. If @@ -200,6 +226,60 @@ typedef TableWriter > NnetChainExampleWrite typedef SequentialTableReader > SequentialNnetChainExampleReader; typedef RandomAccessTableReader > RandomAccessNnetChainExampleReader; + +/// This function returns the 'size' of a chain example as defined for purposes +/// of merging egs, which is defined as the largest number of Indexes in any of +/// the inputs or outputs of the example. +int32 GetChainNnetExampleSize(const NnetChainExample &a); + + +/// This class is responsible for arranging examples in groups that have the +/// same strucure (i.e. 
the same input and output indexes), and outputting them +/// in suitable minibatches as defined by ExampleMergingConfig. +class ChainExampleMerger { + public: + ChainExampleMerger(const ExampleMergingConfig &config, + NnetChainExampleWriter *writer); + + // This function accepts an example, and if possible, writes a merged example + // out. The ownership of the pointer 'a' is transferred to this class when + // you call this function. + void AcceptExample(NnetChainExample *a); + + // This function announces to the class that the input has finished, so it + // should flush out any smaller-sizes minibatches, as dictated by the config. + // This will be called in the destructor, but you can call it explicitly when + // all the input is done if you want to. + // It also prints the stats. + void Finish(); + + // returns a suitable exit status for a program. + bool ExitStatus() { return num_egs_written_ > 0; } + + ~ChainExampleMerger() { Finish(); }; + private: + // called by Finish() and AcceptExample(). Merges, updates the stats, and + // writes. The 'egs' is non-const only because the egs are temporarily + // changed inside MergeChainEgs. The pointer 'egs' is still owned + // by the caller. + void WriteMinibatch(std::vector *egs); + + bool finished_; + int32 num_egs_written_; + const ExampleMergingConfig &config_; + NnetChainExampleWriter *writer_; + ExampleSizeStats stats_; + + // Note: the "key" into the egs is the first element of the vector. + typedef unordered_map, + NnetChainExampleStructureHasher, + NnetChainExampleStructureCompare> MapType; +MapType eg_to_egs_; +}; + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-discriminative-example.cc b/src/nnet3/nnet-discriminative-example.cc index 5c02998cbcf..debc91b96c9 100644 --- a/src/nnet3/nnet-discriminative-example.cc +++ b/src/nnet3/nnet-discriminative-example.cc @@ -249,13 +249,15 @@ void MergeSupervision( } -void MergeDiscriminativeExamples(bool compress, - std::vector *input, - NnetDiscriminativeExample *output) { +void MergeDiscriminativeExamples( + bool compress, + std::vector *input, + NnetDiscriminativeExample *output) { int32 num_examples = input->size(); KALDI_ASSERT(num_examples > 0); - // we temporarily make the input-features in 'input' look like regular NnetExamples, - // so that we can recycle the MergeExamples() function. + // we temporarily make the input-features in 'input' look like regular + // NnetExamples, so that we can recycle the + // MergeExamples() function. std::vector eg_inputs(num_examples); for (int32 i = 0; i < num_examples; i++) eg_inputs[i].io.swap((*input)[i].inputs); @@ -414,6 +416,160 @@ void ShiftDiscriminativeExampleTimes(int32 frame_shift, } } +size_t NnetDiscriminativeExampleStructureHasher::operator () ( + const NnetDiscriminativeExample &eg) const { + // these numbers were chosen at random from a list of primes. 
+ NnetIoStructureHasher io_hasher; + size_t size = eg.inputs.size(), ans = size * 35099; + for (size_t i = 0; i < size; i++) + ans = ans * 19157 + io_hasher(eg.inputs[i]); + for (size_t i = 0; i < eg.outputs.size(); i++) { + const NnetDiscriminativeSupervision &sup = eg.outputs[i]; + StringHasher string_hasher; + IndexVectorHasher indexes_hasher; + ans = ans * 17957 + + string_hasher(sup.name) + indexes_hasher(sup.indexes); + } + return ans; +} + +bool NnetDiscriminativeExampleStructureCompare::operator () ( + const NnetDiscriminativeExample &a, + const NnetDiscriminativeExample &b) const { + NnetIoStructureCompare io_compare; + if (a.inputs.size() != b.inputs.size() || + a.outputs.size() != b.outputs.size()) + return false; + size_t size = a.inputs.size(); + for (size_t i = 0; i < size; i++) + if (!io_compare(a.inputs[i], b.inputs[i])) + return false; + size = a.outputs.size(); + for (size_t i = 0; i < size; i++) + if (a.outputs[i].name != b.outputs[i].name || + a.outputs[i].indexes != b.outputs[i].indexes) + return false; + return true; +} + + +int32 GetNnetDiscriminativeExampleSize(const NnetDiscriminativeExample &a) { + int32 ans = 0; + for (size_t i = 0; i < a.inputs.size(); i++) { + int32 s = a.inputs[i].indexes.size(); + if (s > ans) + ans = s; + } + for (size_t i = 0; i < a.outputs.size(); i++) { + int32 s = a.outputs[i].indexes.size(); + if (s > ans) + ans = s; + } + return ans; +} + + +DiscriminativeExampleMerger::DiscriminativeExampleMerger(const ExampleMergingConfig &config, + NnetDiscriminativeExampleWriter *writer): + finished_(false), num_egs_written_(0), + config_(config), writer_(writer) { } + + +void DiscriminativeExampleMerger::AcceptExample(NnetDiscriminativeExample *eg) { + KALDI_ASSERT(!finished_); + // If an eg with the same structure as 'eg' is already a key in the + // map, it won't be replaced, but if it's new it will be made + // the key. Also we remove the key before making the vector empty. + // This way we ensure that the eg in the key is always the first + // element of the vector. + std::vector &vec = eg_to_egs_[eg]; + vec.push_back(eg); + int32 eg_size = GetNnetDiscriminativeExampleSize(*eg), + num_available = vec.size(); + bool input_ended = false; + int32 minibatch_size = config_.MinibatchSize(eg_size, num_available, + input_ended); + if (minibatch_size != 0) { // we need to write out a merged eg. + KALDI_ASSERT(minibatch_size == num_available); + + std::vector vec_copy(vec); + eg_to_egs_.erase(eg); + + // MergeDiscriminativeExamples() expects a vector of NnetDiscriminativeExample, not of pointers, + // so use swap to create that without doing any real work. + std::vector egs_to_merge(minibatch_size); + for (int32 i = 0; i < minibatch_size; i++) { + egs_to_merge[i].Swap(vec[i]); + delete vec[i]; // we owned those pointers. 
+ } + WriteMinibatch(&egs_to_merge); + } +} + +void DiscriminativeExampleMerger::WriteMinibatch( + std::vector *egs) { + KALDI_ASSERT(!egs->empty()); + int32 eg_size = GetNnetDiscriminativeExampleSize((*egs)[0]); + NnetDiscriminativeExampleStructureHasher eg_hasher; + size_t structure_hash = eg_hasher((*egs)[0]); + int32 minibatch_size = egs->size(); + stats_.WroteExample(eg_size, structure_hash, minibatch_size); + NnetDiscriminativeExample merged_eg; + MergeDiscriminativeExamples(config_.compress, egs, &merged_eg); + std::ostringstream key; + key << "merged-" << (num_egs_written_++) << "-" << minibatch_size; + writer_->Write(key.str(), merged_eg); +} + +void DiscriminativeExampleMerger::Finish() { + if (finished_) return; // already finished. + finished_ = true; + + // we'll convert the map eg_to_egs_ to a vector of vectors to avoid + // iterator invalidation problems. + std::vector > all_egs; + all_egs.reserve(eg_to_egs_.size()); + + MapType::iterator iter = eg_to_egs_.begin(), end = eg_to_egs_.end(); + for (; iter != end; ++iter) + all_egs.push_back(iter->second); + eg_to_egs_.clear(); + + for (size_t i = 0; i < all_egs.size(); i++) { + int32 minibatch_size; + std::vector &vec = all_egs[i]; + KALDI_ASSERT(!vec.empty()); + int32 eg_size = GetNnetDiscriminativeExampleSize(*(vec[0])); + bool input_ended = true; + while (!vec.empty() && + (minibatch_size = config_.MinibatchSize(eg_size, vec.size(), + input_ended)) != 0) { + // MergeDiscriminativeExamples() expects a vector of + // NnetDiscriminativeExample, not of pointers, so use swap to create that + // without doing any real work. + std::vector egs_to_merge(minibatch_size); + for (int32 i = 0; i < minibatch_size; i++) { + egs_to_merge[i].Swap(vec[i]); + delete vec[i]; // we owned those pointers. + } + vec.erase(vec.begin(), vec.begin() + minibatch_size); + WriteMinibatch(&egs_to_merge); + } + if (!vec.empty()) { + int32 eg_size = GetNnetDiscriminativeExampleSize(*(vec[0])); + NnetDiscriminativeExampleStructureHasher eg_hasher; + size_t structure_hash = eg_hasher(*(vec[0])); + int32 num_discarded = vec.size(); + stats_.DiscardedExamples(eg_size, structure_hash, num_discarded); + for (int32 i = 0; i < num_discarded; i++) + delete vec[i]; + vec.clear(); + } + } +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-discriminative-example.h b/src/nnet3/nnet-discriminative-example.h index bb60f216a82..3a170e6bbd6 100644 --- a/src/nnet3/nnet-discriminative-example.h +++ b/src/nnet3/nnet-discriminative-example.h @@ -26,6 +26,7 @@ #include "util/table-types.h" #include "nnet3/discriminative-supervision.h" #include "nnet3/nnet-example.h" +#include "nnet3/nnet-example-utils.h" #include "hmm/posterior.h" #include "hmm/transition-model.h" @@ -128,6 +129,32 @@ struct NnetDiscriminativeExample { } }; + +/// This hashing object hashes just the structural aspects of the NnetExample +/// without looking at the value of the features. It will be used in combining +/// egs into batches of all similar structure. +struct NnetDiscriminativeExampleStructureHasher { + size_t operator () (const NnetDiscriminativeExample &eg) const; + // We also provide a version of this that works from pointers. + size_t operator () (const NnetDiscriminativeExample *eg) const { + return (*this)(*eg); + } +}; + + +/// This comparator object compares just the structural aspects of the +/// NnetDiscriminativeExample without looking at the value of the features. 
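Like the ChainExampleMerger above, the DiscriminativeExampleMerger declared further down in this header follows one calling convention: the caller hands over newly allocated examples (AcceptExample takes ownership of the pointer), then calls Finish() once the input is exhausted so that any remaining groups are flushed or discarded according to the config. A condensed sketch of that driver loop, roughly as the updated merge binaries later in this commit drive it, with the reader advanced each iteration (MergeAllExamples is an illustrative name):

#include "nnet3/nnet-discriminative-example.h"

namespace kaldi {
namespace nnet3 {

// Reads discriminative egs, groups them by structure, and writes merged
// minibatches; returns true iff at least one minibatch was written.
bool MergeAllExamples(const ExampleMergingConfig &config,
                      SequentialNnetDiscriminativeExampleReader *reader,
                      NnetDiscriminativeExampleWriter *writer) {
  DiscriminativeExampleMerger merger(config, writer);
  for (; !reader->Done(); reader->Next()) {
    // The merger takes ownership of this pointer.
    merger.AcceptExample(new NnetDiscriminativeExample(reader->Value()));
  }
  merger.Finish();             // flush leftover groups and print the stats
  return merger.ExitStatus();  // whether anything was written
}

}  // namespace nnet3
}  // namespace kaldi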
+struct NnetDiscriminativeExampleStructureCompare { + bool operator () (const NnetDiscriminativeExample &a, + const NnetDiscriminativeExample &b) const; + // We also provide a version of this that works from pointers. + bool operator () (const NnetDiscriminativeExample *a, + const NnetDiscriminativeExample *b) const { + return (*this)(*a, *b); + } +}; + + /** Appends the given vector of examples (which must be non-empty) into a single output example. @@ -140,13 +167,12 @@ struct NnetDiscriminativeExample { MergeExamples() routine while avoiding having to rewrite code. */ void MergeDiscriminativeExamples( - bool compress, std::vector *input, + bool compress, NnetDiscriminativeExample *output); // called from MergeDiscriminativeExamples, this function merges the Supervision // objects into one. Requires (and checks) that they all have the same name. - void MergeSupervision( const std::vector &inputs, NnetDiscriminativeSupervision *output); @@ -194,11 +220,63 @@ void GetDiscriminativeComputationRequest(const Nnet &nnet, bool use_xent_derivative, ComputationRequest *computation_request); - typedef TableWriter > NnetDiscriminativeExampleWriter; typedef SequentialTableReader > SequentialNnetDiscriminativeExampleReader; typedef RandomAccessTableReader > RandomAccessNnetDiscriminativeExampleReader; + +/// This function returns the 'size' of a discriminative example as defined for +/// purposes of merging egs, which is defined as the largest number of Indexes +/// in any of the inputs or outputs of the example. +int32 GetDiscriminativeNnetExampleSize(const NnetDiscriminativeExample &a); + + +/// This class is responsible for arranging examples in groups that have the +/// same strucure (i.e. the same input and output indexes), and outputting them +/// in suitable minibatches as defined by ExampleMergingConfig. +class DiscriminativeExampleMerger { + public: + DiscriminativeExampleMerger(const ExampleMergingConfig &config, + NnetDiscriminativeExampleWriter *writer); + + // This function accepts an example, and if possible, writes a merged example + // out. The ownership of the pointer 'a' is transferred to this class when + // you call this function. + void AcceptExample(NnetDiscriminativeExample *a); + + // This function announces to the class that the input has finished, so it + // should flush out any smaller-sizes minibatches, as dictated by the config. + // This will be called in the destructor, but you can call it explicitly when + // all the input is done if you want to. + // It also prints the stats. + void Finish(); + + // returns a suitable exit status for a program. + bool ExitStatus() { return num_egs_written_ > 0; } + + ~DiscriminativeExampleMerger() { Finish(); }; + private: + // called by Finish() and AcceptExample(). Merges, updates the stats, and + // writes. The 'egs' is non-const only because the egs are temporarily + // changed inside MergeDiscriminativeEgs. The pointer 'egs' is still owned + // by the caller. + void WriteMinibatch(std::vector *egs); + + bool finished_; + int32 num_egs_written_; + const ExampleMergingConfig &config_; + NnetDiscriminativeExampleWriter *writer_; + ExampleSizeStats stats_; + + // Note: the "key" into the egs is the first element of the vector. 
+ typedef unordered_map, + NnetDiscriminativeExampleStructureHasher, + NnetDiscriminativeExampleStructureCompare> MapType; + MapType eg_to_egs_; +}; + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index e88eff71e77..77395759d8d 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -815,6 +815,23 @@ void UtteranceSplitter::SetOutputWeights( } } +int32 ExampleMergingConfig::IntSet::LargestValueInRange(int32 max_value) const { + KALDI_ASSERT(!ranges.empty()); + int32 ans = 0, num_ranges = ranges.size(); + for (int32 i = 0; i < num_ranges; i++) { + int32 possible_ans = 0; + if (max_value >= ranges[i].first) { + if (max_value >= ranges[i].second) + possible_ans = ranges[i].second; + else + possible_ans = max_value; + } + if (possible_ans > ans) + ans = possible_ans; + } + return ans; +} + // static bool ExampleMergingConfig::ParseIntSet(const std::string &str, ExampleMergingConfig::IntSet *int_set) { @@ -1166,9 +1183,6 @@ void ExampleMerger::Finish() { vec.clear(); } } - - - } diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 75a47772fda..46d6906ff99 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -458,6 +458,7 @@ class ExampleSizeStats { /// indexes), and outputting them in suitable minibatches /// as defined by ExampleMergingConfig. class ExampleMerger { + public: ExampleMerger(const ExampleMergingConfig &config, NnetExampleWriter *writer); @@ -473,6 +474,9 @@ class ExampleMerger { // It also prints the stats. void Finish(); + // returns a suitable exit status for a program. + bool ExitStatus() { return num_egs_written_ > 0; } + ~ExampleMerger() { Finish(); }; private: // called by Finish() and AcceptExample(). Merges, updates the diff --git a/src/nnet3bin/nnet3-discriminative-merge-egs.cc b/src/nnet3bin/nnet3-discriminative-merge-egs.cc index 5c386bd40b3..0edf960fdf9 100644 --- a/src/nnet3bin/nnet3-discriminative-merge-egs.cc +++ b/src/nnet3bin/nnet3-discriminative-merge-egs.cc @@ -41,14 +41,12 @@ int main(int argc, char *argv[]) { "nnet3-discriminative-merge-egs --minibatch-size=128 ark:1.degs ark:- | nnet3-discriminative-train ... \n" "See also nnet3-discriminative-copy-egs\n"; - bool compress = false; - int32 minibatch_size = 64; - + ExampleMergingConfig merging_config; + merging_config.minibatch_size = 64; // change the default for this + // program.. anyway it will usually be + // set on the command line. 
ParseOptions po(usage); - po.Register("minibatch-size", &minibatch_size, "Target size of minibatches " - "when merging (see also --measure-output-frames)"); - po.Register("compress", &compress, "If true, compress the output examples " - "(not recommended unless you are writing to disk"); + merging_config.Register(&po); po.Read(argc, argv); @@ -63,35 +61,14 @@ int main(int argc, char *argv[]) { SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier); NnetDiscriminativeExampleWriter example_writer(examples_wspecifier); - std::vector examples; - examples.reserve(minibatch_size); - - int64 num_read = 0, num_written = 0; + DiscriminativeExampleMerger merger(merging_config, &example_writer); while (!example_reader.Done()) { const NnetDiscriminativeExample &cur_eg = example_reader.Value(); - examples.resize(examples.size() + 1); - examples.back() = cur_eg; - - bool minibatch_ready = - static_cast(examples.size()) >= minibatch_size; - - // Do Next() now, so we can test example_reader.Done() below . - example_reader.Next(); - num_read++; - - if (minibatch_ready || (example_reader.Done() && !examples.empty())) { - NnetDiscriminativeExample merged_eg; - MergeDiscriminativeExamples(compress, &examples, &merged_eg); - std::ostringstream ostr; - ostr << "merged-" << num_written; - num_written++; - std::string output_key = ostr.str(); - example_writer.Write(output_key, merged_eg); - examples.clear(); - } + merger.AcceptExample(new NnetDiscriminativeExample(cur_eg)); } - KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.'; - return (num_written != 0 ? 0 : 1); + // the merger itself prints the necessary diagnostics. + merger.Finish(); + return merger.ExitStatus(); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; diff --git a/src/nnet3bin/nnet3-merge-egs.cc b/src/nnet3bin/nnet3-merge-egs.cc index 48ba2986512..33a65d140f2 100644 --- a/src/nnet3bin/nnet3-merge-egs.cc +++ b/src/nnet3bin/nnet3-merge-egs.cc @@ -59,23 +59,10 @@ int main(int argc, char *argv[]) { "nnet3-merge-egs --minibatch-size=512 ark:1.egs ark:- | nnet3-train-simple ... 
\n" "See also nnet3-copy-egs\n"; - bool compress = false; - int32 minibatch_size = 512; - bool measure_output_frames = true; - bool discard_partial_minibatches = false; - ParseOptions po(usage); - po.Register("minibatch-size", &minibatch_size, "Target size of minibatches " - "when merging (see also --measure-output-frames)"); - po.Register("measure-output-frames", &measure_output_frames, "If true, " - "--minibatch-size is a target number of total output frames; if " - "false, --minibatch-size is the number of input examples to " - "merge."); - po.Register("compress", &compress, "If true, compress the output examples " - "(not recommended unless you are writing to disk)"); - po.Register("discard-partial-minibatches", &discard_partial_minibatches, - "discard any partial minibatches of 'uneven' size that may be " - "encountered at the end."); + + ExampleMergingConfig merging_config; + merging_config.Register(&po); po.Read(argc, argv); @@ -87,44 +74,20 @@ int main(int argc, char *argv[]) { std::string examples_rspecifier = po.GetArg(1), examples_wspecifier = po.GetArg(2); + merging_config.ComputeDerived(); + SequentialNnetExampleReader example_reader(examples_rspecifier); NnetExampleWriter example_writer(examples_wspecifier); - std::vector examples; - examples.reserve(minibatch_size); - - int32 cur_num_output_frames = 0; + ExampleMerger merger(merging_config, &example_writer); - int64 num_read = 0, num_written = 0; while (!example_reader.Done()) { const NnetExample &cur_eg = example_reader.Value(); - examples.resize(examples.size() + 1); - examples.back() = cur_eg; - cur_num_output_frames += NumOutputIndexes(cur_eg); - bool minibatch_ready = - (measure_output_frames ? - cur_num_output_frames >= minibatch_size : - static_cast(examples.size()) >= minibatch_size); - - // Do Next() now, so we can test example_reader.Done() below . - example_reader.Next(); - num_read++; - - if (minibatch_ready || (!discard_partial_minibatches && - (example_reader.Done() && !examples.empty()))) { - NnetExample merged_eg; - MergeExamples(examples, compress, &merged_eg); - std::ostringstream ostr; - ostr << "merged-" << num_written; - num_written++; - std::string output_key = ostr.str(); - example_writer.Write(output_key, merged_eg); - examples.clear(); - cur_num_output_frames = 0; - } + merger.AcceptExample(new NnetExample(cur_eg)); } - KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.'; - return (num_written != 0 ? 0 : 1); + // the merger itself prints the necessary diagnostics. + merger.Finish(); + return merger.ExitStatus(); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; diff --git a/src/util/kaldi-holder-inl.h b/src/util/kaldi-holder-inl.h index 4297af9a2e2..5768d4c6b03 100644 --- a/src/util/kaldi-holder-inl.h +++ b/src/util/kaldi-holder-inl.h @@ -97,6 +97,7 @@ template class KaldiObjectHolder { } void Swap(KaldiObjectHolder *other) { + // the t_ values are pointers so this is a shallow swap. std::swap(t_, other->t_); } From 9f833f97ac517c808cb2eddc8a4507c13e539d34 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 29 Dec 2016 15:51:26 -0800 Subject: [PATCH 033/213] [scripts][nnet3] Remove valid-{left,right}-context and priors-{left,right}-context from scripts, making them the same as regular context; use regular chunk-width for validation egs, not 1. 
--- .../s5/local/chain/run_tdnn_discriminative.sh | 32 +++++++------- .../tuning/run_blstm_6h_discriminative.sh | 32 +++++++------- .../tuning/run_tdnn_6h_discriminative.sh | 30 ++++++------- .../nnet3/train/chain_objf/acoustic_model.py | 9 ---- .../train/frame_level_objf/acoustic_model.py | 5 --- .../nnet3/train/frame_level_objf/raw_model.py | 5 --- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 29 +++++-------- egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh | 4 ++ egs/wsj/s5/steps/nnet3/get_egs.sh | 28 +++++-------- .../s5/steps/nnet3/get_egs_discriminative.sh | 40 +++++------------- egs/wsj/s5/steps/nnet3/get_egs_targets.sh | 42 ++++++++----------- egs/wsj/s5/steps/nnet3/lstm/train.sh | 5 ++- egs/wsj/s5/steps/nnet3/tdnn/train.sh | 3 ++ egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh | 3 ++ egs/wsj/s5/steps/nnet3/train_dnn.py | 1 - egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 1 - egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 2 - egs/wsj/s5/steps/nnet3/train_rnn.py | 2 - egs/wsj/s5/steps/nnet3/train_tdnn.sh | 3 ++ 19 files changed, 105 insertions(+), 171 deletions(-) diff --git a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh index bda883f16c2..f4d40884058 100755 --- a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh @@ -10,7 +10,7 @@ set -e # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# +# stage=0 @@ -59,8 +59,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! cuda-compiled; then - cat </dev/null || true data_dirs= - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ $x $train_data_dir exp/shift_hires mfcc_hires utils/fix_data_dir.sh ${train_data_dir}_fs$x @@ -103,7 +103,7 @@ if [ $frame_subsampling_factor -ne 1 ]; then awk -v nfs=$x '{print "fs"nfs"-"$0}' $train_ivector_dir/ivector_online.scp >> ${train_ivector_dir}_fs/ivector_online.scp done utils/combine_data.sh ${train_data_dir}_fs $data_dirs - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do rm -r ${train_data_dir}_fs$x done fi @@ -112,9 +112,9 @@ if [ $frame_subsampling_factor -ne 1 ]; then affix=_fs fi - + rm ${train_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true -for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do awk -v nfs=$x '{print "fs"nfs"-"$0}' $train_ivector_dir/ivector_online.scp >> ${train_ivector_dir}_fs/ivector_online.scp done train_ivector_dir=${train_ivector_dir}_fs @@ -133,7 +133,7 @@ fi if [ -z "$lats_dir" ]; then lats_dir=${srcdir}_denlats${affix} if [ $stage -le 2 ]; then - nj=50 + nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of # the phases of get_egs_discriminative.sh below. 
num_threads_denlats=6 @@ -147,16 +147,13 @@ if [ -z "$lats_dir" ]; then fi fi -model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` -model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - -cmvn_opts=`cat $srcdir/cmvn_opts` +cmvn_opts=`cat $srcdir/cmvn_opts` if [ -z "$degs_dir" ]; then degs_dir=${srcdir}_degs${affix} @@ -176,8 +173,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors false --acwt 1.0 \ --online-ivector-dir $train_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; fi @@ -202,7 +198,7 @@ if [ $stage -le 5 ]; then ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` iter=epoch$[x*frame_subsampling_factor] - + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --acwt 1.0 --post-decode-acwt 10.0 \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ @@ -219,7 +215,7 @@ if [ $stage -le 5 ]; then done done wait - [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 fi if [ $stage -le 6 ] && $cleanup; then diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh index b0264c17d8b..e3884df8711 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh @@ -8,7 +8,7 @@ set -e # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# +# . cmd.sh @@ -52,7 +52,7 @@ effective_learning_rate=0.000000125 max_param_change=1 num_jobs_nnet=4 num_epochs=4 -regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 ## Decode options @@ -60,8 +60,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! 
cuda-compiled; then - cat </dev/null || true data_dirs= - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ $x $train_data_dir exp/shift_hires/ mfcc_hires utils/fix_data_dir.sh ${train_data_dir}_fs$x @@ -110,7 +110,7 @@ if [ $frame_subsampling_factor -ne 1 ]; then awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp done utils/combine_data.sh ${train_data_dir}_fs $data_dirs - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do rm -r ${train_data_dir}_fs$x done fi @@ -119,9 +119,9 @@ if [ $frame_subsampling_factor -ne 1 ]; then affix=_fs fi - + rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true -for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp done online_ivector_dir=${online_ivector_dir}_fs @@ -140,7 +140,7 @@ fi if [ -z "$lats_dir" ]; then lats_dir=${srcdir}_denlats${affix} if [ $stage -le 2 ]; then - nj=50 + nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of # the phases of get_egs_discriminative.sh below. num_threads_denlats=6 @@ -154,16 +154,13 @@ if [ -z "$lats_dir" ]; then fi fi -model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` -model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - -cmvn_opts=`cat $srcdir/cmvn_opts` +cmvn_opts=`cat $srcdir/cmvn_opts` if [ -z "$degs_dir" ]; then degs_dir=${srcdir}_degs${affix} @@ -183,8 +180,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors false --acwt 1.0 \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; fi @@ -210,7 +206,7 @@ if [ $stage -le 5 ]; then ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` iter=epoch$x.adj - + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --acwt 1.0 --post-decode-acwt 10.0 \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} $context_opts \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh index 85afa7bf9ca..9a7c4ca2859 100755 --- 
a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh @@ -85,7 +85,7 @@ effective_learning_rate=0.000000125 max_param_change=1 num_jobs_nnet=4 num_epochs=4 -regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 ## Decode options @@ -93,8 +93,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! cuda-compiled; then - cat </dev/null || true data_dirs= - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ $x $train_data_dir exp/shift_hires/ mfcc_hires utils/fix_data_dir.sh ${train_data_dir}_fs$x @@ -137,7 +137,7 @@ if [ $frame_subsampling_factor -ne 1 ]; then awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp done utils/combine_data.sh ${train_data_dir}_fs $data_dirs - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do rm -r ${train_data_dir}_fs$x done fi @@ -146,9 +146,9 @@ if [ $frame_subsampling_factor -ne 1 ]; then affix=_fs fi - + rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true -for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp done online_ivector_dir=${online_ivector_dir}_fs @@ -167,7 +167,7 @@ fi if [ -z "$lats_dir" ]; then lats_dir=${srcdir}_denlats${affix} if [ $stage -le 2 ]; then - nj=50 + nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of # the phases of get_egs_discriminative.sh below. 
num_threads_denlats=6 @@ -181,16 +181,13 @@ if [ -z "$lats_dir" ]; then fi fi -model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` -model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - -cmvn_opts=`cat $srcdir/cmvn_opts` +cmvn_opts=`cat $srcdir/cmvn_opts` if [ -z "$degs_dir" ]; then degs_dir=${srcdir}_degs${affix} @@ -210,8 +207,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors false --acwt 1.0 \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; fi @@ -237,7 +233,7 @@ if [ $stage -le 5 ]; then ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` iter=epoch$x.adj - + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --acwt 1.0 --post-decode-acwt 10.0 \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 0d20b7c3287..95fbb073c15 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -52,7 +52,6 @@ def create_denominator_fst(dir, tree_dir, run_opts): def generate_chain_egs(dir, data, lat_dir, egs_dir, left_context, right_context, run_opts, stage=0, - valid_left_context=None, valid_right_context=None, left_tolerance=None, right_tolerance=None, frame_subsampling_factor=3, alignment_subsampling_factor=3, @@ -72,8 +71,6 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, --transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ --left-context {left_context} --right-context {right_context} \ - --valid-left-context '{valid_left_context}' \ - --valid-right-context '{valid_right_context}' \ --left-tolerance '{left_tolerance}' \ --right-tolerance '{right_tolerance}' \ --frame-subsampling-factor {frame_subsampling_factor} \ @@ -93,12 +90,6 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, if online_ivector_dir is not None else ''), left_context=left_context, right_context=right_context, - valid_left_context=(valid_left_context - if valid_left_context is not None - else ''), - valid_right_context=(valid_right_context - if valid_right_context is not None - else ''), left_tolerance=(left_tolerance if left_tolerance is not None else ''), diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index 1360f669f41..f3104f93089 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ 
b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -20,7 +20,6 @@ def generate_egs(data, alidir, egs_dir, left_context, right_context, - valid_left_context, valid_right_context, run_opts, stage=0, feat_type='raw', online_ivector_dir=None, samples_per_iter=20000, frames_per_eg=20, srand=0, @@ -40,8 +39,6 @@ def generate_egs(data, alidir, egs_dir, --transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ --left-context {left_context} --right-context {right_context} \ - --valid-left-context {valid_left_context} \ - --valid-right-context {valid_right_context} \ --stage {stage} \ --samples-per-iter {samples_per_iter} \ --frames-per-eg {frames_per_eg} \ @@ -57,8 +54,6 @@ def generate_egs(data, alidir, egs_dir, if online_ivector_dir is not None else ''), left_context=left_context, right_context=right_context, - valid_left_context=valid_left_context, - valid_right_context=valid_right_context, stage=stage, samples_per_iter=samples_per_iter, frames_per_eg=frames_per_eg, srand=srand, data=data, alidir=alidir, egs_dir=egs_dir, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py index 58240dd2f1b..0fe8e3d4927 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -19,7 +19,6 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, left_context, right_context, - valid_left_context, valid_right_context, run_opts, stage=0, feat_type='raw', online_ivector_dir=None, target_type='dense', num_targets=-1, @@ -56,8 +55,6 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, --transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ --left-context {left_context} --right-context {right_context} \ - --valid-left-context {valid_left_context} \ - --valid-right-context {valid_right_context} \ --stage {stage} \ --samples-per-iter {samples_per_iter} \ --frames-per-eg {frames_per_eg} \ @@ -75,8 +72,6 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, if online_ivector_dir is not None else ''), left_context=left_context, right_context=right_context, - valid_left_context=valid_left_context, - valid_right_context=valid_right_context, stage=stage, samples_per_iter=samples_per_iter, frames_per_eg=frames_per_eg, srand=srand, num_targets=num_targets, diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 7b330f8f717..49ec694a19d 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -33,10 +33,6 @@ alignment_subsampling_factor=3 # frames-per-second of input alignments divided left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. -valid_left_context= # amount of left_context for validation egs, typically used in - # recurrent architectures to ensure matched condition with - # training egs -valid_right_context= # amount of right_context for validation egs compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). 
@@ -292,20 +288,14 @@ if [ $stage -le 2 ]; then fi -egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" - -[ -z $valid_left_context ] && valid_left_context=$left_context; -[ -z $valid_right_context ] && valid_right_context=$right_context; -# don't do the overlap thing for the validation data. -valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" - -ctc_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" +chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" [ ! -z $right_tolerance ] && \ - ctc_supervision_all_opts="$ctc_supervision_all_opts --right-tolerance=$right_tolerance" + chain_supervision_all_opts="$chain_supervision_all_opts --right-tolerance=$right_tolerance" [ ! -z $left_tolerance ] && \ - ctc_supervision_all_opts="$ctc_supervision_all_opts --left-tolerance=$left_tolerance" + chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context @@ -320,17 +310,17 @@ if [ $stage -le 3 ]; then $cmd $dir/log/create_valid_subset.log \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ - chain-get-supervision $ctc_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ + chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ ark:- ark:- \| \ nnet3-chain-get-egs $valid_ivector_opt --srand=$srand \ - $valid_egs_opts $chaindir/normalization.fst \ + $egs_opts $chaindir/normalization.fst \ "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ - chain-get-supervision $ctc_supervision_all_opts \ + chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $train_subset_ivector_opt --srand=$srand \ - $valid_egs_opts $chaindir/normalization.fst \ + $egs_opts $chaindir/normalization.fst \ "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 @@ -379,9 +369,10 @@ if [ $stage -le 4 ]; then $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ - chain-get-supervision $ctc_supervision_all_opts \ + chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opt --srand=\$[JOB+$srand] $egs_opts \ + --num-frames-overlap=$frames_overlap_per_eg \ "$feats" ark,s,cs:- ark:- \| \ nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 
a8211c5fbc5..ada92e66ff4 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -1,5 +1,7 @@ #!/bin/bash +# THIS SCRIPT IS DEPRECATED, see ./train.py + # note, TDNN is the same as what we used to call multisplice. # This version of the script, nnet3/chain/train_tdnn.sh, is for 'chain' systems. @@ -102,6 +104,8 @@ right_deriv_truncate= # number of time-steps to avoid using the deriv of, on th trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + +echo "$0: THIS SCRIPT IS DEPRECATED" echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 79bfc25fff6..9992285baaa 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -22,10 +22,6 @@ frames_per_eg=8 # number of frames of labels per example. more->less disk spa left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. -valid_left_context= # amount of left_context for validation egs, typically used in - # recurrent architectures to ensure matched condition with - # training egs -valid_right_context= # amount of right_context for validation egs compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). @@ -37,8 +33,8 @@ reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg num_utts_subset=300 # number of utterances in validation and training # subsets used for shrinkage and diagnostics. num_valid_frames_combine=0 # #valid frames for combination weights at the very end. -num_train_frames_combine=10000 # # train frames for the above. -num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs +num_train_frames_combine=60000 # # train frames for the above. +num_frames_diagnostic=10000 # number of frames for "compute_prob" jobs samples_per_iter=400000 # this is the target number of egs in each archive of egs # (prior to merging egs). We probably should have called # it egs_per_iter. 
This is just a guideline; it will pick @@ -266,11 +262,7 @@ if [ $stage -le 2 ]; then copy-int-vector ark:- ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; fi -egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" - -[ -z $valid_left_context ] && valid_left_context=$left_context; -[ -z $valid_right_context ] && valid_right_context=$right_context; -valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" +egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress --num-frames=$frames_per_eg" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context @@ -284,28 +276,28 @@ if [ $stage -le 3 ]; then <$dir/ali.scp >$dir/ali_special.scp $cmd $dir/log/create_valid_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $valid_ivector_opt $valid_egs_opts "$valid_feats" \ + nnet3-get-egs --num-pdfs=$num_pdfs $valid_ivector_opt $egs_opts "$valid_feats" \ "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ "ark:$dir/valid_all.egs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ + nnet3-get-egs --num-pdfs=$num_pdfs $train_subset_ivector_opt $egs_opts "$train_subset_feats" \ "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ "ark:$dir/train_subset_all.egs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." $cmd $dir/log/create_valid_subset_combine.log \ - nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ + nnet3-subset-egs --n=$[$num_valid_frames_combine/$frames_per_eg] ark:$dir/valid_all.egs \ ark:$dir/valid_combine.egs || touch $dir/.error & $cmd $dir/log/create_valid_subset_diagnostic.log \ - nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ + nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg] ark:$dir/valid_all.egs \ ark:$dir/valid_diagnostic.egs || touch $dir/.error & $cmd $dir/log/create_train_subset_combine.log \ - nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ + nnet3-subset-egs --n=$[$num_train_frames_combine/$frames_per_eg] ark:$dir/train_subset_all.egs \ ark:$dir/train_combine.egs || touch $dir/.error & $cmd $dir/log/create_train_subset_diagnostic.log \ - nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ + nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg] ark:$dir/train_subset_all.egs \ ark:$dir/train_diagnostic.egs || touch $dir/.error & wait sleep 5 # wait for file system to sync. @@ -328,7 +320,7 @@ if [ $stage -le 4 ]; then echo "$0: Generating training examples on disk" # The examples will go round-robin to egs_list. 
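The divisions introduced just above exist because nnet3-subset-egs selects whole examples, while num_train_frames_combine and num_frames_diagnostic are frame budgets; once each example holds frames_per_eg frames, the budget has to be divided down to an example count. A small sketch with the new default values (plain integer division, as in the shell $[ ] arithmetic):

    frames_per_eg = 8
    num_train_frames_combine = 60000   # frame budget for the 'combine' subset
    num_frames_diagnostic = 10000      # frame budget for the compute_prob jobs

    # counts actually passed to nnet3-subset-egs --n=...
    egs_train_combine = num_train_frames_combine // frames_per_eg   # 7500 egs
    egs_diagnostic = num_frames_diagnostic // frames_per_eg         # 1250 egs
    print(egs_train_combine, egs_diagnostic)
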
$cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" \ + nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts "$feats" \ "ark,s,cs:filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ nnet3-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh index c3baa5dbbc8..0fc8753fa0f 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -21,13 +21,7 @@ frame_subsampling_factor=1 # ratio between input and output frame-rate of nnet. left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. -valid_left_context= # amount of left_context for validation egs, typically used in - # recurrent architectures to ensure matched condition with - # training egs -valid_right_context= # amount of right_context for validation egs adjust_priors=true -priors_left_context= # amount of left_context for priors egs -priors_right_context= # amount of right_context for priors egs compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). num_utts_subset=80 # number of utterances in validation and training @@ -54,7 +48,9 @@ cmvn_opts= # can be used for specifying CMVN options, if feature type is not ld # it doesn't make sense to use different options than were used as input to the # LDA transform). This is used to turn off CMVN in the online-nnet experiments. -num_priors_subset=100 +num_priors_subset=1000 # number of utterances used to calibrate the per-state + # priors. Note: these don't have to be held out from + # the training data. num_archives_priors=10 # End configuration section. @@ -279,37 +275,22 @@ fi splitter_opts="--supervision-splitter.determinize=$determinize --supervision-splitter.minimize=$minimize --supervision-splitter.remove_output_symbols=$remove_output_symbols --supervision-splitter.remove_epsilons=$remove_epsilons --supervision-splitter.collapse-transition-ids=$collapse_transition_ids --supervision-splitter.acoustic-scale=$acwt" -[ -z $valid_left_context ] && valid_left_context=$left_context; -[ -z $valid_right_context ] && valid_right_context=$right_context; - -[ -z $priors_left_context ] && priors_left_context=$left_context; -[ -z $priors_right_context ] && priors_right_context=$right_context; - left_context=$[left_context+frame_subsampling_factor/2] right_context=$[right_context+frame_subsampling_factor/2] -egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts" - -valid_left_context=$[valid_left_context+frame_subsampling_factor/2] -valid_right_context=$[valid_right_context+frame_subsampling_factor/2] +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --compress=$compress --frame-subsampling-factor=$frame_subsampling_factor $splitter_opts" -# don't do the overlap thing for the validation data. 
-valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts" - -priors_left_context=$[priors_left_context+frame_subsampling_factor/2] -priors_right_context=$[priors_right_context+frame_subsampling_factor/2] - -# don't do the overlap thing for the priors computation data. -priors_egs_opts="--left-context=$priors_left_context --right-context=$priors_right_context --num-frames=1 --compress=$compress" +# don't do the overlap thing for the priors computation data-- but do use the +# same num-frames for the eg, which would be much more efficient in case it's a +# recurrent model and has a lot of frames of context. In any case we're not +# doing SGD so there is no benefit in having short chunks. +priors_egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --compress=$compress" supervision_all_opts="--frame-subsampling-factor=$frame_subsampling_factor" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context -echo $priors_left_context > $dir/info/priors_left_context -echo $priors_right_context > $dir/info/priors_right_context - echo $frame_subsampling_factor > $dir/info/frame_subsampling_factor @@ -368,7 +349,7 @@ if [ $stage -le 4 ]; then $cmd $dir/log/create_valid_subset.log \ discriminative-get-supervision $supervision_all_opts \ scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ - nnet3-discriminative-get-egs $valid_ivector_opt $valid_egs_opts \ + nnet3-discriminative-get-egs $valid_ivector_opt $egs_opts \ $dir/final.mdl "$valid_feats" ark,s,cs:- "ark:$dir/valid_diagnostic.degs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ @@ -405,6 +386,7 @@ if [ $stage -le 5 ]; then "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \ "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz |" ark:- \| \ nnet3-discriminative-get-egs $ivector_opt $egs_opts \ + --num-frames-overlap=$frames_overlap_per_eg \ $dir/final.mdl "$feats" ark,s,cs:- ark:- \| \ nnet3-discriminative-copy-egs --random=true --srand=JOB ark:- $degs_list || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index 309c89cf99d..5221916e5c0 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # 2015-2016 Vimal Manohar # Apache 2.0. # This script is similar to steps/nnet3/get_egs.sh but used -# when getting general targets (not from alignment directory) for raw nnet +# when getting general targets (not from alignment directory) for raw nnet # # This script, which will generally be called from other neural-net training # scripts, extracts the training examples used to train the neural net (and also @@ -21,7 +21,7 @@ # Begin configuration section. cmd=run.pl feat_type=raw # set it to 'lda' to use LDA features. -target_type=sparse # dense to have dense targets, +target_type=sparse # dense to have dense targets, # sparse to have posteriors targets num_targets= # required for target-type=sparse with raw nnet frames_per_eg=8 # number of frames of labels per example. more->less disk space and @@ -30,10 +30,6 @@ frames_per_eg=8 # number of frames of labels per example. 
more->less disk spa left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. -valid_left_context= # amount of left_context for validation egs, typically used in - # recurrent architectures to ensure matched condition with - # training egs -valid_right_context= # amount of right_context for validation egs compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). @@ -45,15 +41,15 @@ reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg num_utts_subset=300 # number of utterances in validation and training # subsets used for shrinkage and diagnostics. num_valid_frames_combine=0 # #valid frames for combination weights at the very end. -num_train_frames_combine=10000 # # train frames for the above. -num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs +num_train_frames_combine=60000 # # train frames for the above. +num_frames_diagnostic=10000 # number of frames for "compute_prob" jobs samples_per_iter=400000 # this is the target number of egs in each archive of egs # (prior to merging egs). We probably should have called # it egs_per_iter. This is just a guideline; it will pick # a number that divides the number of samples in the # entire data. -transform_dir= +transform_dir= stage=0 nj=6 # This should be set to the maximum number of jobs you are @@ -254,11 +250,7 @@ if [ -e $dir/storage ]; then done fi -egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" - -[ -z $valid_left_context ] && valid_left_context=$left_context; -[ -z $valid_right_context ] && valid_right_context=$right_context; -valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" +egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress --num-frames=$frames_per_eg" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context @@ -274,12 +266,12 @@ if [ $target_type == "dense" ]; then fi if [ -z "$num_targets" ]; then - echo "$0: num-targets is not set" + echo "$0: num-targets is not set" exit 1 fi case $target_type in - "dense") + "dense") get_egs_program="nnet3-get-egs-dense-targets --num-targets=$num_targets" targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | copy-feats scp:- ark:- |" @@ -289,7 +281,7 @@ case $target_type in "sparse") get_egs_program="nnet3-get-egs --num-pdfs=$num_targets" targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | ali-to-post scp:- ark:- |" - valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | ali-to-post scp:- ark:- |" + valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | ali-to-post scp:- ark:- |" train_subset_targets="ark:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | ali-to-post scp:- ark:- |" ;; default) @@ -302,29 +294,29 @@ if [ $stage -le 3 ]; then rm -f $dir/.error 2>/dev/null $cmd $dir/log/create_valid_subset.log \ $get_egs_program \ - $valid_ivector_opt $valid_egs_opts "$valid_feats" \ + $valid_ivector_opt $egs_opts "$valid_feats" \ "$valid_targets" \ "ark:$dir/valid_all.egs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ $get_egs_program \ - $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ + $train_subset_ivector_opt $egs_opts "$train_subset_feats" \ "$train_subset_targets" \ 
"ark:$dir/train_subset_all.egs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." $cmd $dir/log/create_valid_subset_combine.log \ - nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ + nnet3-subset-egs --n=$[$num_valid_frames_combine/$frames_per_eg] ark:$dir/valid_all.egs \ ark:$dir/valid_combine.egs || touch $dir/.error & $cmd $dir/log/create_valid_subset_diagnostic.log \ - nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ + nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg] ark:$dir/valid_all.egs \ ark:$dir/valid_diagnostic.egs || touch $dir/.error & $cmd $dir/log/create_train_subset_combine.log \ - nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ + nnet3-subset-egs --n=$[$num_train_frames_combine/$frames_per_eg] ark:$dir/train_subset_all.egs \ ark:$dir/train_combine.egs || touch $dir/.error & $cmd $dir/log/create_train_subset_diagnostic.log \ - nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ + nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg] ark:$dir/train_subset_all.egs \ ark:$dir/train_diagnostic.egs || touch $dir/.error & wait sleep 5 # wait for file system to sync. @@ -348,7 +340,7 @@ if [ $stage -le 4 ]; then # The examples will go round-robin to egs_list. $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ $get_egs_program \ - $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" "$targets" \ + $ivector_opt $egs_opts "$feats" "$targets" \ ark:- \| \ nnet3-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/lstm/train.sh b/egs/wsj/s5/steps/nnet3/lstm/train.sh index 5be69aacff0..3f9b7bccb06 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/train.sh +++ b/egs/wsj/s5/steps/nnet3/lstm/train.sh @@ -1,5 +1,7 @@ #!/bin/bash +# THIS SCRIPT IS DEPRECATED, see ../train_rnn.py + # Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen @@ -116,6 +118,7 @@ rand_prune=4.0 # speeds up LDA. trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM +echo "$0: THIS SCRIPT IS DEPRECATED" echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi @@ -298,8 +301,6 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then extra_opts+=(--transform-dir $transform_dir) extra_opts+=(--left-context $left_context) extra_opts+=(--right-context $right_context) - extra_opts+=(--valid-left-context $((chunk_width + left_context))) - extra_opts+=(--valid-right-context $((chunk_width + right_context))) # Note: in RNNs we process sequences of labels rather than single label per sample echo "$0: calling get_egs.sh" diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train.sh b/egs/wsj/s5/steps/nnet3/tdnn/train.sh index 49eeabcd9a8..dfe02931758 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/train.sh +++ b/egs/wsj/s5/steps/nnet3/tdnn/train.sh @@ -1,5 +1,7 @@ #!/bin/bash +# THIS SCRIPT IS DEPRECATED, see ../train_dnn.py + # note, TDNN is the same as what we used to call multisplice. # Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). @@ -83,6 +85,7 @@ subset_dim=0 trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM +echo "$0: THIS SCRIPT IS DEPRECATED" echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . 
./path.sh; fi diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh index a04a0e894ac..8fce9ae3831 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh +++ b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh @@ -1,5 +1,7 @@ #!/bin/bash +# THIS SCRIPT IS DEPRECATED, see ../train_raw_dnn.py + # note, TDNN is the same as what we used to call multisplice. # THIS SCRIPT IS DEPRECATED, see ../train_raw_dnn.py @@ -70,6 +72,7 @@ dense_targets=true # Use dense targets instead of sparse targets trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM +echo "$0: THIS SCRIPT IS DEPRECATED" echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index ca495654819..ad4f8477689 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -209,7 +209,6 @@ def train(args, run_opts, background_process_handler): train_lib.acoustic_model.generate_egs( data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, - valid_left_context=left_context, valid_right_context=right_context, run_opts=run_opts, frames_per_eg=args.frames_per_eg, srand=args.srand, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 21cbca64e7a..215b6abef59 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -221,7 +221,6 @@ def train(args, run_opts, background_process_handler): data=args.feat_dir, targets_scp=args.targets_scp, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, - valid_left_context=left_context, valid_right_context=right_context, run_opts=run_opts, frames_per_eg=args.frames_per_eg, srand=args.srand, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index e8a48653a5a..ef46d962393 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -302,8 +302,6 @@ def train(args, run_opts, background_process_handler): data=args.feat_dir, targets_scp=args.targets_scp, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, - valid_left_context=left_context + args.chunk_width, - valid_right_context=right_context + args.chunk_width, run_opts=run_opts, frames_per_eg=args.chunk_width, srand=args.srand, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index d08585fa537..93fd7da0dc4 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -288,8 +288,6 @@ def train(args, run_opts, background_process_handler): train_lib.acoustic_model.generate_egs( data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, - valid_left_context=left_context + args.chunk_width, - valid_right_context=right_context + args.chunk_width, run_opts=run_opts, frames_per_eg=args.chunk_width, srand=args.srand, diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh index fb7a5a38f49..37540e488c2 100755 --- a/egs/wsj/s5/steps/nnet3/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/train_tdnn.sh @@ -1,5 +1,7 @@ #!/bin/bash +# THIS SCRIPT IS DEPRECATED, see ./train_dnn.py + # note, TDNN is the same as what we used to call multisplice. 
# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). @@ -81,6 +83,7 @@ frames_per_eg=8 # to be passed on to get_egs.sh trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM +echo "$0: THIS SCRIPT IS DEPRECATED" echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi From 0d5488107ef78f5e066811e77be425f5a05d1aa3 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 29 Dec 2016 18:23:19 -0800 Subject: [PATCH 034/213] Removing the --reduce-frames-per-eg option as a simplification prior to changes to frames-per-eg code --- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 3 +-- egs/wsj/s5/steps/nnet3/get_egs.sh | 20 ++++-------------- .../s5/steps/nnet3/get_egs_discriminative.sh | 1 - egs/wsj/s5/steps/nnet3/get_egs_targets.sh | 21 ++++--------------- 4 files changed, 9 insertions(+), 36 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 49ec694a19d..9018c2e2472 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -21,8 +21,7 @@ cmd=run.pl feat_type=raw # set it to 'lda' to use LDA features. frames_per_eg=25 # number of feature frames example (not counting added context). # more->less disk space and less time preparing egs, but more - # I/O during training. note: the script may reduce this if - # reduce_frames_per_eg is true. + # I/O during training. frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. # can be useful to avoid wasted data if you're using --left-deriv-truncate # and --right-deriv-truncate. diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 9992285baaa..5da6b6e0228 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -18,18 +18,12 @@ cmd=run.pl feat_type=raw # set it to 'lda' to use LDA features. frames_per_eg=8 # number of frames of labels per example. more->less disk space and # less time preparing egs, but more I/O during training. - # note: the script may reduce this if reduce_frames_per_eg is true. left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). -reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg - # if there is only one archive and even with the - # reduced frames_per_eg, the number of - # samples_per_iter that would result is less than or - # equal to the user-specified value. num_utts_subset=300 # number of utterances in validation and training # subsets used for shrinkage and diagnostics. num_valid_frames_combine=0 # #valid frames for combination weights at the very end. @@ -206,17 +200,11 @@ fi # the + 1 is to round up, not down... we assume it doesn't divide exactly. num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1] -# (for small data)- while reduce_frames_per_eg == true and the number of -# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it -# by 1. 
-reduced=false -while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \ - [ $[$num_frames/(($frames_per_eg-1)*$samples_per_iter)] -eq 0 ]; do - frames_per_eg=$[$frames_per_eg-1] - num_archives=1 - reduced=true +if [ $num_archives -eq 1 ]; then + echo "*** $0: warning: the --frames-per-eg is too large to generate one archive with" + echo "*** as many as --samples-per-iter egs in it. Consider reducing --frames-per-eg." + sleep 4 done -$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small." # We may have to first create a smaller number of larger archives, with number # $num_archives_intermediate, if $num_archives is more than the maximum number diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh index 0fc8753fa0f..a8b6b3376b5 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -12,7 +12,6 @@ cmd=run.pl feat_type=raw # set it to 'lda' to use LDA features. frames_per_eg=150 # number of frames of labels per example. more->less disk space and # less time preparing egs, but more I/O during training. - # note: the script may reduce this if reduce_frames_per_eg is true. frames_overlap_per_eg=30 # number of supervised frames of overlap that we aim for per eg. # can be useful to avoid wasted data if you're using --left-deriv-truncate # and --right-deriv-truncate. diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index 5221916e5c0..46cf6eb0c20 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -26,18 +26,11 @@ target_type=sparse # dense to have dense targets, num_targets= # required for target-type=sparse with raw nnet frames_per_eg=8 # number of frames of labels per example. more->less disk space and # less time preparing egs, but more I/O during training. - # note: the script may reduce this if reduce_frames_per_eg is true. left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). - -reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg - # if there is only one archive and even with the - # reduced frames_per_eg, the number of - # samples_per_iter that would result is less than or - # equal to the user-specified value. num_utts_subset=300 # number of utterances in validation and training # subsets used for shrinkage and diagnostics. num_valid_frames_combine=0 # #valid frames for combination weights at the very end. @@ -200,17 +193,11 @@ fi # the + 1 is to round up, not down... we assume it doesn't divide exactly. num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1] -# (for small data)- while reduce_frames_per_eg == true and the number of -# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it -# by 1. -reduced=false -while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \ - [ $[$num_frames/(($frames_per_eg-1)*$samples_per_iter)] -eq 0 ]; do - frames_per_eg=$[$frames_per_eg-1] - num_archives=1 - reduced=true +if [ $num_archives -eq 1 ]; then + echo "*** $0: warning: the --frames-per-eg is too large to generate one archive with" + echo "*** as many as --samples-per-iter egs in it. Consider reducing --frames-per-eg." 
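The warning that replaces this loop fires when the whole data set fits into a single archive. A rough sketch of that condition, with made-up sizes (the real script derives num_frames from the data directory):

    num_frames = 2000000          # total supervised frames (illustrative)
    frames_per_eg = 25
    samples_per_iter = 400000     # target number of egs per archive

    # '+ 1' rounds up, as in the shell arithmetic above
    num_archives = num_frames // (frames_per_eg * samples_per_iter) + 1
    if num_archives == 1:
        # too little data (or too-long egs) to fill even one archive with
        # samples_per_iter examples, hence the advice to reduce --frames-per-eg
        print("warning: consider reducing --frames-per-eg")
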
+ sleep 4 done -$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small." # We may have to first create a smaller number of larger archives, with number # $num_archives_intermediate, if $num_archives is more than the maximum number From ab38f331199b8e93fc4edb1067ee76421faa6db4 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 29 Dec 2016 19:41:07 -0800 Subject: [PATCH 035/213] Modify scripts to use --max-deriv-time-relative option in place of --max-deriv-time (removes need to know num-frames) --- .../nnet3/train/chain_objf/acoustic_model.py | 12 +++--- .../nnet3/train/frame_level_objf/common.py | 42 +++++++------------ egs/wsj/s5/steps/nnet3/chain/train.py | 8 ++-- egs/wsj/s5/steps/nnet3/get_egs.sh | 26 +++++++++--- egs/wsj/s5/steps/nnet3/get_egs_targets.sh | 26 +++++++++--- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 8 ++-- egs/wsj/s5/steps/nnet3/train_rnn.py | 8 ++-- 7 files changed, 72 insertions(+), 58 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 95fbb073c15..d6f6d47b018 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -108,7 +108,7 @@ def train_new_models(dir, iter, srand, num_jobs, num_archives_processed, num_archives, raw_model_string, egs_dir, left_context, right_context, apply_deriv_weights, - min_deriv_time, max_deriv_time, + min_deriv_time, max_deriv_time_relative, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch, @@ -130,9 +130,9 @@ def train_new_models(dir, iter, srand, num_jobs, if min_deriv_time is not None: deriv_time_opts.append("--optimization.min-deriv-time={0}".format( min_deriv_time)) - if max_deriv_time is not None: - deriv_time_opts.append("--optimization.max-deriv-time={0}".format( - int(max_deriv_time))) + if max_deriv_time_relative is not None: + deriv_time_opts.append("--optimization.max-deriv-time-relative={0}".format( + int(max_deriv_time_relative))) processes = [] for job in range(1, num_jobs+1): @@ -209,7 +209,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, num_hidden_layers, add_layers_period, left_context, right_context, apply_deriv_weights, min_deriv_time, - max_deriv_time, + max_deriv_time_relative, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, @@ -315,7 +315,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, left_context=left_context, right_context=right_context, apply_deriv_weights=apply_deriv_weights, min_deriv_time=min_deriv_time, - max_deriv_time=max_deriv_time, + max_deriv_time_relative=max_deriv_time_relative, l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 65a9c105e45..1afa532e914 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -30,7 +30,7 @@ def train_new_models(dir, iter, srand, num_jobs, shuffle_buffer_size, minibatch_size, cache_read_opt, run_opts, frames_per_eg=-1, - min_deriv_time=None, max_deriv_time=None): + min_deriv_time=None, max_deriv_time_relative=None): """ Called from train_one_iteration(), this model does one iteration of training with 
'num_jobs' jobs, and writes files like exp/tdnn_a/24.{1,2,3,..}.raw @@ -59,9 +59,9 @@ def train_new_models(dir, iter, srand, num_jobs, if min_deriv_time is not None: deriv_time_opts.append("--optimization.min-deriv-time={0}".format( min_deriv_time)) - if max_deriv_time is not None: - deriv_time_opts.append("--optimization.max-deriv-time={0}".format( - max_deriv_time)) + if max_deriv_time_relative is not None: + deriv_time_opts.append("--optimization.max-deriv-time-relative={0}".format( + max_deriv_time_relative)) context_opts = "--left-context={0} --right-context={1}".format( left_context, right_context) @@ -140,8 +140,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, momentum, max_param_change, shuffle_buffer_size, run_opts, cv_minibatch_size=256, frames_per_eg=-1, - min_deriv_time=None, max_deriv_time=None, - shrinkage_value=1.0, dropout_edit_string="", + min_deriv_time=None, max_deriv_time_relative=None, + shrinkage_value=1.0, get_raw_nnet_from_am=True, background_process_handler=None): """ Called from steps/nnet3/train_*.py scripts for one iteration of neural @@ -172,10 +172,9 @@ def train_one_iteration(dir, iter, srand, egs_dir, if os.path.exists('{0}/srand'.format(dir)): try: saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) - except (IOError, ValueError): - logger.error("Exception while reading the random seed " - "for training") - raise + except (IOError, ValueError) as e: + raise Exception("Exception while reading the random seed " + "for training: {0}".format(e.str())) if srand != saved_srand: logger.warning("The random seed provided to this iteration " "(srand={0}) is different from the one saved last " @@ -249,8 +248,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, "{dir}/{iter}.raw - |".format( lr=learning_rate, dir=dir, iter=iter)) - raw_model_string = raw_model_string + dropout_edit_string - if do_average: cur_minibatch_size = minibatch_size cur_max_param_change = max_param_change @@ -268,15 +265,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, except OSError: pass - shrink_info_str = '' - if shrinkage_value != 1.0: - shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) - - logger.info("On iteration {0}, learning rate is {1}" - "{shrink_info}.".format( - iter, learning_rate, - shrink_info=shrink_info_str)) - train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, num_archives_processed=num_archives_processed, num_archives=num_archives, @@ -288,7 +276,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, cache_read_opt=cache_read_opt, run_opts=run_opts, frames_per_eg=frames_per_eg, min_deriv_time=min_deriv_time, - max_deriv_time=max_deriv_time) + max_deriv_time_relative=max_deriv_time_relative) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -318,8 +306,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, for i in range(1, num_jobs + 1): os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) except OSError: - logger.error("Error while trying to delete the raw models") - raise + raise Exception("Error while trying to delete the raw models") if get_raw_nnet_from_am: new_model = "{0}/{1}.mdl".format(dir, iter + 1) @@ -369,9 +356,8 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, try: os.remove(file) except OSError: - logger.error("There was error while trying to remove " - "lda stat files.") - raise + raise Exception("There was error while trying to remove " + "lda stat files.") # this computes a 
fixed affine transform computed in the way we described # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled # variant of an LDA transform but without dimensionality reduction. @@ -479,7 +465,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, models_to_combine.add(num_iters) - for iter in sorted(models_to_combine): + for iter in models_to_combine: if get_raw_nnet_from_am: model_file = '{0}/{1}.mdl'.format(dir, iter) if not os.path.exists(model_file): diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index f658d2a770f..18bc3128bcb 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -395,11 +395,11 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): args.final_effective_lrate) min_deriv_time = None - max_deriv_time = None + max_deriv_time_relative = None if args.deriv_truncate_margin is not None: min_deriv_time = -args.deriv_truncate_margin - model_left_context - max_deriv_time = (args.chunk_width - 1 + args.deriv_truncate_margin - + model_right_context) + max_deriv_time_relative = \ + args.deriv_truncate_margin + model_right_context logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) @@ -445,7 +445,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): right_context=right_context, apply_deriv_weights=args.apply_deriv_weights, min_deriv_time=min_deriv_time, - max_deriv_time=max_deriv_time, + max_deriv_time_relative=max_deriv_time_relative, l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, leaky_hmm_coefficient=args.leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 5da6b6e0228..e442dce9032 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -18,6 +18,9 @@ cmd=run.pl feat_type=raw # set it to 'lda' to use LDA features. frames_per_eg=8 # number of frames of labels per example. more->less disk space and # less time preparing egs, but more I/O during training. + # Note: may in general be a comma-separated string of alternative + # durations (more useful when using large chunks, e.g. for BLSTMs); + # the first one (the principal num-frames) is preferred. left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. @@ -66,6 +69,11 @@ if [ $# != 3 ]; then echo " --feat-type # (raw is the default). The feature type you want" echo " # to use as input to the neural net." echo " --frames-per-eg # number of frames per eg on disk" + echo " # May be either a single number or a comma-separated list" + echo " # of alternatives (useful when training LSTMs, where the" + echo " # frames-per-eg is the chunk size, to get variety of chunk" + echo " # sizes). The first in the list is preferred and is used" + echo " # when working out the number of archives etc." 
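Since --frames-per-eg may now be a comma-separated list of chunk sizes, only its first ("principal") entry feeds the archive bookkeeping; the shell extracts it with cut -d, -f1. An illustrative rendering of the same bookkeeping in Python (the data sizes are invented):

    frames_per_eg = "150,110,75"                                  # alternative chunk sizes
    frames_per_eg_principal = int(frames_per_eg.split(",")[0])    # -> 150

    num_frames = 500000000            # total frames in the data (illustrative)
    samples_per_iter = 400000         # target number of egs per archive
    # '+ 1' rounds up, as in the shell arithmetic
    num_archives = num_frames // (frames_per_eg_principal * samples_per_iter) + 1
    egs_per_archive = num_frames // (frames_per_eg_principal * num_archives)
    print(frames_per_eg_principal, num_archives, egs_per_archive)
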
echo " --left-context # Number of frames on left side to append for feature input" echo " --right-context # Number of frames on right side to append for feature input" echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics" @@ -198,8 +206,14 @@ else feat_dim=$(cat $dir/info/feat_dim) || exit 1; fi + +# the first field in frames_per_eg (which is a comma-separated list of numbers) +# is the 'principal' frames-per-eg, and for purposes of working out the number +# of archives we assume that this will be the average number of frames per eg. +frames_per_eg_principal=$(echo $frames_per_eg | cut -d, -f1) + # the + 1 is to round up, not down... we assume it doesn't divide exactly. -num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1] +num_archives=$[$num_frames/($frames_per_eg_principal*$samples_per_iter)+1] if [ $num_archives -eq 1 ]; then echo "*** $0: warning: the --frames-per-eg is too large to generate one archive with" echo "*** as many as --samples-per-iter egs in it. Consider reducing --frames-per-eg." @@ -222,7 +236,7 @@ num_archives=$[$archives_multiple*$num_archives_intermediate] echo $num_archives >$dir/info/num_archives echo $frames_per_eg >$dir/info/frames_per_eg # Work out the number of egs per archive -egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] +egs_per_archive=$[$num_frames/($frames_per_eg_principal*$num_archives)] ! [ $egs_per_archive -le $samples_per_iter ] && \ echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \ && exit 1; @@ -275,17 +289,17 @@ if [ $stage -le 3 ]; then [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." $cmd $dir/log/create_valid_subset_combine.log \ - nnet3-subset-egs --n=$[$num_valid_frames_combine/$frames_per_eg] ark:$dir/valid_all.egs \ + nnet3-subset-egs --n=$[$num_valid_frames_combine/$frames_per_eg_principal] ark:$dir/valid_all.egs \ ark:$dir/valid_combine.egs || touch $dir/.error & $cmd $dir/log/create_valid_subset_diagnostic.log \ - nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg] ark:$dir/valid_all.egs \ + nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg_principal] ark:$dir/valid_all.egs \ ark:$dir/valid_diagnostic.egs || touch $dir/.error & $cmd $dir/log/create_train_subset_combine.log \ - nnet3-subset-egs --n=$[$num_train_frames_combine/$frames_per_eg] ark:$dir/train_subset_all.egs \ + nnet3-subset-egs --n=$[$num_train_frames_combine/$frames_per_eg_principal] ark:$dir/train_subset_all.egs \ ark:$dir/train_combine.egs || touch $dir/.error & $cmd $dir/log/create_train_subset_diagnostic.log \ - nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg] ark:$dir/train_subset_all.egs \ + nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg_principal] ark:$dir/train_subset_all.egs \ ark:$dir/train_diagnostic.egs || touch $dir/.error & wait sleep 5 # wait for file system to sync. diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index 46cf6eb0c20..eeac84db969 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -26,6 +26,9 @@ target_type=sparse # dense to have dense targets, num_targets= # required for target-type=sparse with raw nnet frames_per_eg=8 # number of frames of labels per example. more->less disk space and # less time preparing egs, but more I/O during training. 
+ # Note: may in general be a comma-separated string of alternative + # durations (more useful when using large chunks, e.g. for BLSTMs); + # the first one (the principal num-frames) is preferred. left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. @@ -73,6 +76,11 @@ if [ $# != 3 ]; then echo " --feat-type # (raw is the default). The feature type you want" echo " # to use as input to the neural net." echo " --frames-per-eg # number of frames per eg on disk" + echo " # May be either a single number or a comma-separated list" + echo " # of alternatives (useful when training LSTMs, where the" + echo " # frames-per-eg is the chunk size, to get variety of chunk" + echo " # sizes). The first in the list is preferred and is used" + echo " # when working out the number of archives etc." echo " --left-context # Number of frames on left side to append for feature input" echo " --right-context # Number of frames on right side to append for feature input" echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics" @@ -191,8 +199,14 @@ else feat_dim=$(cat $dir/info/feat_dim) || exit 1; fi + +# the first field in frames_per_eg (which is a comma-separated list of numbers) +# is the 'principal' frames-per-eg, and for purposes of working out the number +# of archives we assume that this will be the average number of frames per eg. +frames_per_eg_principal=$(echo $frames_per_eg | cut -d, -f1) + # the + 1 is to round up, not down... we assume it doesn't divide exactly. -num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1] +num_archives=$[$num_frames/($frames_per_eg_principal*$samples_per_iter)+1] if [ $num_archives -eq 1 ]; then echo "*** $0: warning: the --frames-per-eg is too large to generate one archive with" echo "*** as many as --samples-per-iter egs in it. Consider reducing --frames-per-eg." @@ -215,7 +229,7 @@ num_archives=$[$archives_multiple*$num_archives_intermediate] echo $num_archives >$dir/info/num_archives echo $frames_per_eg >$dir/info/frames_per_eg # Work out the number of egs per archive -egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] +egs_per_archive=$[$num_frames/($frames_per_eg_principal*$num_archives)] ! [ $egs_per_archive -le $samples_per_iter ] && \ echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \ && exit 1; @@ -293,17 +307,17 @@ if [ $stage -le 3 ]; then [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." 
$cmd $dir/log/create_valid_subset_combine.log \ - nnet3-subset-egs --n=$[$num_valid_frames_combine/$frames_per_eg] ark:$dir/valid_all.egs \ + nnet3-subset-egs --n=$[$num_valid_frames_combine/$frames_per_eg_principal] ark:$dir/valid_all.egs \ ark:$dir/valid_combine.egs || touch $dir/.error & $cmd $dir/log/create_valid_subset_diagnostic.log \ - nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg] ark:$dir/valid_all.egs \ + nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg_principal] ark:$dir/valid_all.egs \ ark:$dir/valid_diagnostic.egs || touch $dir/.error & $cmd $dir/log/create_train_subset_combine.log \ - nnet3-subset-egs --n=$[$num_train_frames_combine/$frames_per_eg] ark:$dir/train_subset_all.egs \ + nnet3-subset-egs --n=$[$num_train_frames_combine/$frames_per_eg_principal] ark:$dir/train_subset_all.egs \ ark:$dir/train_combine.egs || touch $dir/.error & $cmd $dir/log/create_train_subset_diagnostic.log \ - nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg] ark:$dir/train_subset_all.egs \ + nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg_principal] ark:$dir/train_subset_all.egs \ ark:$dir/train_diagnostic.egs || touch $dir/.error & wait sleep 5 # wait for file system to sync. diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index ef46d962393..29d93972fac 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -369,11 +369,11 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): args.final_effective_lrate) min_deriv_time = None - max_deriv_time = None + max_deriv_time_relative = None if args.deriv_truncate_margin is not None: min_deriv_time = -args.deriv_truncate_margin - model_left_context - max_deriv_time = (args.chunk_width - 1 + args.deriv_truncate_margin - + model_right_context) + max_deriv_time_relative = \ + args.deriv_truncate_margin + model_right_context logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) @@ -420,7 +420,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): left_context=left_context, right_context=right_context, min_deriv_time=min_deriv_time, - max_deriv_time=max_deriv_time, + max_deriv_time_relative=max_deriv_time_relative, momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 93fd7da0dc4..0f9cbc59595 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -364,11 +364,11 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): args.final_effective_lrate) min_deriv_time = None - max_deriv_time = None + max_deriv_time_relative = None if args.deriv_truncate_margin is not None: min_deriv_time = -args.deriv_truncate_margin - model_left_context - max_deriv_time = (args.chunk_width - 1 + args.deriv_truncate_margin - + model_right_context) + max_deriv_time_relative = \ + args.deriv_truncate_margin + model_right_context logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) @@ -414,7 +414,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): left_context=left_context, right_context=right_context, min_deriv_time=min_deriv_time, - max_deriv_time=max_deriv_time, + max_deriv_time_relative=max_deriv_time_relative, momentum=args.momentum, max_param_change=args.max_param_change, 
shuffle_buffer_size=args.shuffle_buffer_size, From e066c1baac88a3b98ef2953ad18771a435926ec0 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 29 Dec 2016 22:12:17 -0800 Subject: [PATCH 036/213] Modify nnet3 python scripts to accept comma-separated alternatives in --egs.chunk-width options; remove deprecated option --num-bptt-steps. --- egs/ami/s5b/local/nnet3/run_lstm.sh | 1 - egs/hkust/s5/local/nnet3/run_lstm.sh | 6 +- .../nnet3/train/chain_objf/acoustic_model.py | 6 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 82 +++++++++---------- .../train/frame_level_objf/acoustic_model.py | 6 +- .../nnet3/train/frame_level_objf/common.py | 14 +--- .../nnet3/train/frame_level_objf/raw_model.py | 6 +- egs/wsj/s5/steps/nnet3/chain/train.py | 22 ++--- egs/wsj/s5/steps/nnet3/train_dnn.py | 6 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 6 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 40 ++++----- egs/wsj/s5/steps/nnet3/train_rnn.py | 39 ++++----- 12 files changed, 102 insertions(+), 132 deletions(-) diff --git a/egs/ami/s5b/local/nnet3/run_lstm.sh b/egs/ami/s5b/local/nnet3/run_lstm.sh index c5583e2d0ef..ef5bfb36259 100755 --- a/egs/ami/s5b/local/nnet3/run_lstm.sh +++ b/egs/ami/s5b/local/nnet3/run_lstm.sh @@ -195,7 +195,6 @@ if [ $stage -le 13 ]; then --trainer.optimization.shrink-value 0.99 \ --trainer.rnn.num-chunk-per-minibatch=$num_chunk_per_minibatch \ --trainer.optimization.momentum=$momentum \ - --trainer.rnn.num-bptt-steps 30 \ --egs.chunk-width=$chunk_width \ --egs.chunk-left-context=$chunk_left_context \ --egs.chunk-right-context=$chunk_right_context \ diff --git a/egs/hkust/s5/local/nnet3/run_lstm.sh b/egs/hkust/s5/local/nnet3/run_lstm.sh index 7529147c948..f79ad02b543 100755 --- a/egs/hkust/s5/local/nnet3/run_lstm.sh +++ b/egs/hkust/s5/local/nnet3/run_lstm.sh @@ -42,7 +42,6 @@ shrink=0.98 momentum=0.5 adaptive_shrink=true num_chunk_per_minibatch=100 -num_bptt_steps=20 samples_per_iter=20000 remove_egs=true @@ -60,8 +59,8 @@ frames_per_chunk= . ./utils/parse_options.sh if ! cuda-compiled; then - cat < 0, then it implies frame-level training, which is applicable for DNN training. If it is > 0, then each parallel SGE job created, a different frame numbered 0..frames_per_eg-1 is used. - min_deriv_time: Applicable for RNN training. A default value of None - implies a min_deriv_time of 0 is used. During RNN training, its - value is set to chunk_width - num_bptt_steps in the training - script. """ chunk_level_training = False if frames_per_eg > 0 else True @@ -147,16 +143,12 @@ def train_one_iteration(dir, iter, srand, egs_dir, """ Called from steps/nnet3/train_*.py scripts for one iteration of neural network training - Args: + Selected args: frames_per_eg: The default value -1 implies chunk_level_training, which is particularly applicable to RNN training. If it is > 0, then it implies frame-level training, which is applicable for DNN training. If it is > 0, then each parallel SGE job created, a different frame numbered 0..frames_per_eg-1 is used. - min_deriv_time: Applicable for RNN training. A default value of None - implies a min_deriv_time of 0 is used. During RNN training, its - value is set to chunk_width - num_bptt_steps in the training - script. shrinkage_value: If value is 1.0, no shrinkage is done; otherwise parameter values are scaled by this value. 
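The docstring text removed here described the old absolute --optimization.max-deriv-time; the patch above switches to --optimization.max-deriv-time-relative, which is measured back from the end of the chunk, so the scripts no longer need to know the chunk width. A sketch of how the two relate, assuming the relative value is counted from the last output frame (names mirror the python scripts; the numbers are illustrative):

    deriv_truncate_margin = 8
    model_left_context, model_right_context = 20, 10
    chunk_width = 40                      # only needed for the old absolute form

    min_deriv_time = -deriv_truncate_margin - model_left_context
    old_max_deriv_time = (chunk_width - 1 + deriv_truncate_margin
                          + model_right_context)
    max_deriv_time_relative = deriv_truncate_margin + model_right_context

    # relative to the last output index (chunk_width - 1) the two agree,
    # but the relative form does not depend on chunk_width:
    assert old_max_deriv_time == (chunk_width - 1) + max_deriv_time_relative
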
get_raw_nnet_from_am: If True, then the network is read and stored as @@ -480,7 +472,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, if chunk_width is not None: # this is an RNN model - mbsize = int(1024.0/(chunk_width)) + mbsize = int(1024.0/(common_train_lib.principal_chunk_width(chunk_width))) else: mbsize = 1024 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py index 0fe8e3d4927..3f6e85e3644 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -22,7 +22,7 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, run_opts, stage=0, feat_type='raw', online_ivector_dir=None, target_type='dense', num_targets=-1, - samples_per_iter=20000, frames_per_eg=20, + samples_per_iter=20000, frames_per_eg_str="20", srand=0, egs_opts=None, cmvn_opts=None, transform_dir=None): """ Wrapper for calling steps/nnet3/get_egs_targets.sh @@ -57,7 +57,7 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, --left-context {left_context} --right-context {right_context} \ --stage {stage} \ --samples-per-iter {samples_per_iter} \ - --frames-per-eg {frames_per_eg} \ + --frames-per-eg {frames_per_eg_str} \ --srand {srand} \ --target-type {target_type} \ --num-targets {num_targets} \ @@ -73,7 +73,7 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, else ''), left_context=left_context, right_context=right_context, stage=stage, samples_per_iter=samples_per_iter, - frames_per_eg=frames_per_eg, srand=srand, + frames_per_eg_str=frames_per_eg_str, srand=srand, num_targets=num_targets, data=data, targets_scp=targets_scp, target_type=target_type, diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 18bc3128bcb..7a34df6f587 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -51,11 +51,13 @@ def get_args(): parents=[common_train_lib.CommonParser().parser]) # egs extraction options - parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', - default=150, - help="""Number of output labels in each example. - Caution: if you double this you should halve - --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', + default="20", + help="""Number of frames per chunk in the examples + used to train the RNN. Caution: if you double this you + should halve --trainer.samples-per-iter. 
May be + a comma-separated list of alternatives: first width + is the 'principal' chunk-width, used preferentially""") # chain options parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', @@ -185,8 +187,8 @@ def process_args(args): """ Process the options got from get_args() """ - if args.chunk_width < 1: - raise Exception("--egs.chunk-width should have a minimum value of 1") + if not common_train_lib.validate_chunk_width(args.chunk_width): + raise Exception("--egs.chunk-width has an invalid value"); if args.chunk_left_context < 0: raise Exception("--egs.chunk-left-context should be non-negative") @@ -327,7 +329,7 @@ def train(args, run_opts, background_process_handler): right_tolerance=args.right_tolerance, frame_subsampling_factor=args.frame_subsampling_factor, alignment_subsampling_factor=args.alignment_subsampling_factor, - frames_per_eg=args.chunk_width, + frames_per_eg_str=args.chunk_width, srand=args.srand, egs_opts=args.egs_opts, cmvn_opts=args.cmvn_opts, @@ -342,10 +344,10 @@ def train(args, run_opts, background_process_handler): egs_dir = args.egs_dir [egs_left_context, egs_right_context, - frames_per_eg, num_archives] = ( + frames_per_eg_str, num_archives] = ( common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, egs_left_context, egs_right_context)) - assert(args.chunk_width == frames_per_eg) + assert(args.chunk_width == frames_per_eg_str) num_archives_expanded = num_archives * args.frame_subsampling_factor if (args.num_jobs_final > num_archives_expanded): diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index ad4f8477689..86efa2fdb91 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -210,7 +210,7 @@ def train(args, run_opts, background_process_handler): data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, run_opts=run_opts, - frames_per_eg=args.frames_per_eg, + frames_per_eg_str=str(args.frames_per_eg), srand=args.srand, egs_opts=args.egs_opts, cmvn_opts=args.cmvn_opts, @@ -225,10 +225,10 @@ def train(args, run_opts, background_process_handler): egs_dir = args.egs_dir [egs_left_context, egs_right_context, - frames_per_eg, num_archives] = ( + frames_per_eg_str, num_archives] = ( common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, left_context, right_context)) - assert(args.frames_per_eg == frames_per_eg) + assert(str(args.frames_per_eg) == frames_per_eg_str) if (args.num_jobs_final > num_archives): raise Exception('num_jobs_final cannot exceed the number of archives ' diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 215b6abef59..14385e2cf2c 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -222,7 +222,7 @@ def train(args, run_opts, background_process_handler): egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, run_opts=run_opts, - frames_per_eg=args.frames_per_eg, + frames_per_eg_str=str(args.frames_per_eg), srand=args.srand, egs_opts=args.egs_opts, cmvn_opts=args.cmvn_opts, @@ -239,10 +239,10 @@ def train(args, run_opts, background_process_handler): egs_dir = args.egs_dir [egs_left_context, egs_right_context, - frames_per_eg, num_archives] = ( + frames_per_eg_str, num_archives] = ( common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, left_context, right_context)) - assert(args.frames_per_eg == frames_per_eg) + assert(str(args.frames_per_eg) == 
frames_per_eg_str) if (args.num_jobs_final > num_archives): raise Exception('num_jobs_final cannot exceed the number of archives ' diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 29d93972fac..cf48b300575 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -58,12 +58,13 @@ def get_args(): parents=[common_train_lib.CommonParser().parser]) # egs extraction options - parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', - default=20, - help="""Number of output labels in the sequence - used to train an LSTM. - Caution: if you double this you should halve - --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', + default="20", + help="""Number of frames per chunk in the examples + used to train the RNN. Caution: if you double this you + should halve --trainer.samples-per-iter. May be + a comma-separated list of alternatives: first width + is the 'principal' chunk-width, used preferentially""") parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', default=40, help="""Number of left steps used in the estimation of @@ -122,9 +123,6 @@ def get_args(): dest='num_chunk_per_minibatch', default=100, help="Number of sequences to be processed in " "parallel every minibatch") - parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, - dest='num_bptt_steps', default=None, - help="""Deprecated. Kept for back compatibility.""") parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', default=8, help="""Margin (in input frames) around the 'required' @@ -163,8 +161,8 @@ def process_args(args): """ Process the options got from get_args() """ - if args.chunk_width < 1: - raise Exception("--egs.chunk-width should have a minimum value of 1") + if not common_train_lib.validate_chunk_width(args.chunk_width): + raise Exception("--egs.chunk-width has an invalid value"); if args.chunk_left_context < 0: raise Exception("--egs.chunk-left-context should be non-negative") @@ -172,17 +170,6 @@ def process_args(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") - if args.num_bptt_steps is not None: - # -2 is used to compensate for the splicing of the input frame, - # assuming that splicing spans from -2 to 2 - args.deriv_truncate_margin = args.num_bptt_steps - args.chunk_width - 2 - logger.warning( - "--trainer.rnn.num-bptt-steps (deprecated) is set by user, and " - "--trainer.deriv-truncate-margin is set to (num-bptt-steps - " - "chunk-width - 2) = {0}. 
We recommend using the option " - "--trainer.deriv-truncate-margin.".format( - args.deriv_truncate_margin)) - if (not os.path.exists(args.dir) or not os.path.exists(args.dir+"/configs")): raise Exception("This scripts expects {0} to exist and have a configs " @@ -303,7 +290,7 @@ def train(args, run_opts, background_process_handler): egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, run_opts=run_opts, - frames_per_eg=args.chunk_width, + frames_per_eg_str=args.chunk_width, srand=args.srand, egs_opts=args.egs_opts, cmvn_opts=args.cmvn_opts, @@ -320,10 +307,13 @@ def train(args, run_opts, background_process_handler): egs_dir = args.egs_dir [egs_left_context, egs_right_context, - frames_per_eg, num_archives] = ( + frames_per_eg_str, num_archives] = ( common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, left_context, right_context)) - assert(args.chunk_width == frames_per_eg) + if args.chunk_width != frames_per_eg_str: + raise Exception("mismatch between --egs.chunk-width and the frames_per_eg " + "in the egs dir {0} vs {1}".(args.chunk_width, + frames_per_eg_str)) if (args.num_jobs_final > num_archives): raise Exception('num_jobs_final cannot exceed the number of archives ' diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 0f9cbc59595..8a61d21f340 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -59,12 +59,13 @@ def get_args(): parents=[common_train_lib.CommonParser().parser]) # egs extraction options - parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', - default=20, - help="""Number of output labels in the sequence - used to train an LSTM. - Caution: if you double this you should halve - --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', + default="20", + help="""Number of frames per chunk in the examples + used to train the RNN. Caution: if you double this you + should halve --trainer.samples-per-iter. May be + a comma-separated list of alternatives: first width + is the 'principal' chunk-width, used preferentially""") parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', default=40, help="""Number of left steps used in the estimation of @@ -121,9 +122,6 @@ def get_args(): dest='num_chunk_per_minibatch', default=100, help="Number of sequences to be processed in " "parallel every minibatch") - parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, - dest='num_bptt_steps', default=None, - help="""Deprecated. 
Kept for back compatibility.""") parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', default=8, help="""Margin (in input frames) around the 'required' @@ -159,8 +157,8 @@ def process_args(args): """ Process the options got from get_args() """ - if args.chunk_width < 1: - raise Exception("--egs.chunk-width should have a minimum value of 1") + if not common_train_lib.validate_chunk_width(args.chunk_width): + raise Exception("--egs.chunk-width has an invalid value"); if args.chunk_left_context < 0: raise Exception("--egs.chunk-left-context should be non-negative") @@ -168,17 +166,6 @@ def process_args(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") - if args.num_bptt_steps is not None: - # -2 is used to compensate for the splicing of the input frame, - # assuming that splicing spans from -2 to 2 - args.deriv_truncate_margin = args.num_bptt_steps - args.chunk_width - 2 - logger.warning( - "--trainer.rnn.num-bptt-steps (deprecated) is set by user, and " - "--trainer.deriv-truncate-margin is set to (num-bptt-steps - " - "chunk-width - 2) = {0}. We recommend using the option " - "--trainer.deriv-truncate-margin.".format( - args.deriv_truncate_margin)) - if (not os.path.exists(args.dir) or not os.path.exists(args.dir+"/configs")): raise Exception("This scripts expects {0} to exist and have a configs " @@ -289,7 +276,7 @@ def train(args, run_opts, background_process_handler): data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, run_opts=run_opts, - frames_per_eg=args.chunk_width, + frames_per_eg_str=args.chunk_width, srand=args.srand, egs_opts=args.egs_opts, cmvn_opts=args.cmvn_opts, @@ -304,10 +291,12 @@ def train(args, run_opts, background_process_handler): egs_dir = args.egs_dir [egs_left_context, egs_right_context, - frames_per_eg, num_archives] = ( + frames_per_eg_str, num_archives] = ( common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, left_context, right_context)) - assert(args.chunk_width == frames_per_eg) + if args.chunk_width != frames_per_eg_str: + raise Exception("mismatch between --egs.chunk-width and the frames_per_eg " + "in the egs dir {0} vs {1}".(args.chunk_width, frames_per_eg_str)) if (args.num_jobs_final > num_archives): raise Exception('num_jobs_final cannot exceed the number of archives ' From 12e4eecfba6cbefa2685942ac500a0e2a408ec8a Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 29 Dec 2016 23:45:58 -0800 Subject: [PATCH 037/213] Modifying nnet3 scripts to accept more general form of minibatch-size strings (rules, not just ints.) 
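(Editor's note, not part of the patch: the rule-string formats introduced here are easiest to see by example. The sketch below simply mirrors the docstrings and the self_test() that this commit adds to egs/wsj/s5/steps/libs/nnet3/train/common.py; it assumes Python is started from egs/wsj/s5/steps, as the train*.py scripts are, so that the module imports under the same name the scripts use.)

# Editor's sketch -- illustrative only, mirrors self_test() in common.py.
import libs.nnet3.train.common as common_train_lib

# A chunk-width may now be a comma-separated list of widths; the first entry
# is the 'principal' chunk-width, used preferentially.
assert common_train_lib.validate_chunk_width('64')
assert common_train_lib.validate_chunk_width('64,25,128')
assert common_train_lib.principal_chunk_width('64,25,128') == 64

# A minibatch size may be a plain integer, or a rule of the form
# eg-length1=size-range1/eg-length2=size-range2/... where each size-range is
# a comma-separated list of sizes or ranges of sizes.
assert common_train_lib.validate_minibatch_size_str('512')
assert common_train_lib.validate_minibatch_size_str('128=64-128/256=32,64')

# Halving (done on iteration 0 and just after adding a layer) halves the
# sizes, not the eg-lengths, and never goes below 1.
assert common_train_lib.halve_minibatch_size_str('64') == '32'
assert common_train_lib.halve_minibatch_size_str('1') == '1'
assert (common_train_lib.halve_minibatch_size_str('128=64/256=40,80-100')
        == '128=32/256=20,40-50')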
--- .../nnet3/train/chain_objf/acoustic_model.py | 17 +-- egs/wsj/s5/steps/libs/nnet3/train/common.py | 109 ++++++++++++++++-- .../nnet3/train/frame_level_objf/common.py | 36 +++--- egs/wsj/s5/steps/nnet3/chain/train.py | 19 ++- egs/wsj/s5/steps/nnet3/train_dnn.py | 28 +++-- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 28 +++-- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 30 +++-- egs/wsj/s5/steps/nnet3/train_rnn.py | 30 +++-- 8 files changed, 216 insertions(+), 81 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 5c6345b46f3..f55a647f5e0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -111,7 +111,7 @@ def train_new_models(dir, iter, srand, num_jobs, min_deriv_time, max_deriv_time_relative, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, + shuffle_buffer_size, num_chunk_per_minibatch_str, frame_subsampling_factor, truncate_deriv_weights, cache_io_opts, run_opts): """ @@ -184,7 +184,7 @@ def train_new_models(dir, iter, srand, num_jobs, egs_dir=egs_dir, archive_index=archive_index, buf_size=shuffle_buffer_size, cache_io_opts=cur_cache_io_opts, - num_chunk_per_mb=num_chunk_per_minibatch), + num_chunk_per_mb=num_chunk_per_minibatch_str), wait=False) processes.append(process_handle) @@ -205,7 +205,7 @@ def train_new_models(dir, iter, srand, num_jobs, def train_one_iteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, - num_chunk_per_minibatch, + num_chunk_per_minibatch_str, num_hidden_layers, add_layers_period, left_context, right_context, apply_deriv_weights, min_deriv_time, @@ -285,7 +285,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, iter=iter) if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str cur_max_param_change = max_param_change else: # on iteration zero or when we just added a layer, use a smaller @@ -293,7 +293,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, # the jobs): the model-averaging isn't always helpful when the model is # changing too fast (i.e. it can worsen the objective function), and # the smaller minibatch size will help to keep the update stable. 
- cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_num_chunk_per_minibatch_str = common_train_lib.halve_minibatch_size_str( + cur_num_chunk_per_minibatch_str) cur_max_param_change = float(max_param_change) / math.sqrt(2) raw_model_string = raw_model_string + dropout_edit_string @@ -322,7 +323,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, momentum=momentum, max_param_change=cur_max_param_change, shuffle_buffer_size=shuffle_buffer_size, - num_chunk_per_minibatch=cur_num_chunk_per_minibatch, + num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, frame_subsampling_factor=frame_subsampling_factor, truncate_deriv_weights=truncate_deriv_weights, cache_io_opts=cache_io_opts, run_opts=run_opts) @@ -510,7 +511,7 @@ def compute_progress(dir, iter, run_opts, wait=False, background_process_handler=background_process_handler) -def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch, +def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, left_context, right_context, leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, background_process_handler=None): @@ -552,7 +553,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch, lc=left_context, rc=right_context, l2=l2_regularize, leaky=leaky_hmm_coefficient, dir=dir, raw_models=" ".join(raw_model_strings), - num_chunk_per_mb=num_chunk_per_minibatch, + num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, egs_dir=egs_dir)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 336b204abab..b064f517fc0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -140,18 +140,17 @@ def validate_chunk_width(chunk_width): Expected to be a string representing either an integer, like '20', or a comma-separated list of integers like '20,30,16'""" if not isinstance(chunk_width, str): - return false + return False a = chunk_width.split(","); - if len(a) == 0: - return false + assert len(a) != 0 # would be code error for elem in a: try: i = int(elem) if i < 1: - return false + return False except: - return false - return true + return False + return True def principal_chunk_width(chunk_width): @@ -161,6 +160,91 @@ def principal_chunk_width(chunk_width): raise Exception("Invalid chunk-width {0}".format(chunk_width)) return int(chunk_width.split(",")[0]) + +def validate_minibatch_size_str(minibatch_size_str): + """Validate a minibatch-size string (returns bool). + A minibatch-size string might either be an integer, like '256' + or a rule like '128=64-128/256=32,64', whose format + is: eg-length1=size-range1/eg-length2=size-range2/.... + where the size-range is a comma-separated list of either integers + or ranges. An arbitrary eg will be mapped to the size-range + for the closest of the listed eg-lengths (the eg-length is defined + as the number of input frames, including context frames).""" + if not isinstance(minibatch_size_str, str): + return False + a = minibatch_size_str.split("/") + assert len(a) != 0 # would be code error + + for elem in a: + b = elem.split('=') + # We expect b to have length 2 in the normal case. + if len(b) != 2: + # one-element 'b' is OK if len(a) is 1 (so there is only + # one choice)... this would mean somebody just gave "25" + # or something like that for the minibatch size. 
+ if len(a) == 1 and len(b) == 1: + try: + mb_size = int(b[0]) + return mb_size > 0 + except: + return False + else: + return False + # check that the thing before the '=' sign is a positive integer + try: + i = b[0] + if i <= 0: + return False + except: + return False # not an integer at all. + # check the thing after the '=' sign is a comma-separated list of ranges + ranges = b[1].split(",") + assert len(ranges) > 0 + for range in ranges: + # a range may be either e.g. '64', or '128-256' + try: + c = [ int(x) for x in range.split("-") ] + except: + return False + if len(c) == 1: + if c[0] <= 0: + return False + elif len(c) == 2: + if c[0] <= 0 or c[1] < c[0]: + return False + else: + return False + return True + + +def halve_minibatch_size_str(minibatch_size_str): + """Halve a minibatch-size string, as would be validated by + validate_minibatch_size_str (see docs for that). This halves + all the integer elements of minibatch_size_str that represent minibatch + sizes (as opposed to chunk-lengths) and that are >1.""" + + if not validate_minibatch_size_str(minibatch_size_str): + raise Exception("Invalid minibatch-size string '{0}'".format(minibatch_size_str)) + + a = minibatch_size_str.split("/") + ans = [] + for elem in a: + b = elem.split('=') + # We expect b to have length 2 in the normal case. + if len(b) == 1: + mb_size = int(b[0]) + ans.append(str(max(1, mb_size / 2))) + else: + assert len(b) == 2 + ranges_out = [] + ranges = b[1].split(',') + for range in ranges: + c = [ str(max(1, int(x)/2)) for x in range.split('-') ] + ranges_out.append('-'.join(c)) + ans.append('{0}={1}'.format(b[0], ','.join(ranges_out))) + return '/'.join(ans) + + def copy_egs_properties_to_exp_dir(egs_dir, dir): try: for file in ['cmvn_opts', 'splice_opts', 'final.mat']: @@ -218,7 +302,7 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, frames_per_eg_str = open('{0}/info/frames_per_eg'.format( egs_dir)).readline().rstrip() - if (!validate_chunk_width(frames_per_eg_str)): + if not validate_chunk_width(frames_per_eg_str): raise Exception("Invalid frames_per_eg in directory {0}/info".format( egs_dir)) num_archives = int(open('{0}/info/num_archives'.format( @@ -412,6 +496,13 @@ def remove_model(nnet_dir, iter, num_iters, models_to_combine=None, os.remove(file_name) +def self_test(): + assert halve_minibatch_size_str('64') == '32' + assert halve_minibatch_size_str('1') == '1' + assert halve_minibatch_size_str('128=64/256=40,80-100') == '128=32/256=20,40-50' + assert validate_chunk_width('64') + assert validate_chunk_width('64,25,128') + class CommonParser: """Parser for parsing common options related to nnet3 training. 
@@ -622,3 +713,7 @@ def __init__(self): help="""Polling frequency in seconds at which the background process handler checks for errors in the processes.""") + + +if __name__ == '__main__': + self_test() diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index a6f09f8b2ce..c1c95b4748f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -27,7 +27,7 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model_string, egs_dir, left_context, right_context, momentum, max_param_change, - shuffle_buffer_size, minibatch_size, + shuffle_buffer_size, minibatch_size_str, cache_read_opt, run_opts, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None): @@ -91,7 +91,7 @@ def train_new_models(dir, iter, srand, num_jobs, """ark:{egs_dir}/egs.{archive_index}.ark ark:- |""" """nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} """ """--srand={srand} ark:- ark:- | """ - """nnet3-merge-egs --minibatch-size={minibatch_size} """ + """nnet3-merge-egs --minibatch-size={minibatch_size_str} """ """--measure-output-frames=false """ """--discard-partial-minibatches=true ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw""".format( @@ -111,7 +111,7 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model=raw_model_string, context_opts=context_opts, egs_dir=egs_dir, archive_index=archive_index, shuffle_buffer_size=shuffle_buffer_size, - minibatch_size=minibatch_size), wait=False) + minibatch_size_str=minibatch_size_str), wait=False) processes.append(process_handle) @@ -130,12 +130,12 @@ def train_new_models(dir, iter, srand, num_jobs, def train_one_iteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, - learning_rate, minibatch_size, + learning_rate, minibatch_size_str, num_hidden_layers, add_layers_period, left_context, right_context, momentum, max_param_change, shuffle_buffer_size, run_opts, - cv_minibatch_size=256, frames_per_eg=-1, + cv_minibatch_size_str='256', frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, shrinkage_value=1.0, get_raw_nnet_from_am=True, @@ -182,7 +182,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, dir=dir, iter=iter, egs_dir=egs_dir, left_context=left_context, right_context=right_context, run_opts=run_opts, - mb_size=cv_minibatch_size, + minibatch_size_str=cv_minibatch_size_str, get_raw_nnet_from_am=get_raw_nnet_from_am, wait=False, background_process_handler=background_process_handler) @@ -192,7 +192,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, left_context=left_context, right_context=right_context, run_opts=run_opts, - mb_size=cv_minibatch_size, wait=False, + minibatch_size_str=cv_minibatch_size_str, wait=False, get_raw_nnet_from_am=get_raw_nnet_from_am, background_process_handler=background_process_handler) @@ -241,7 +241,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, lr=learning_rate, dir=dir, iter=iter)) if do_average: - cur_minibatch_size = minibatch_size + cur_minibatch_size_str = minibatch_size_str cur_max_param_change = max_param_change else: # on iteration zero or when we just added a layer, use a smaller @@ -249,7 +249,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, # the jobs): the model-averaging isn't always helpful when the model is # changing too fast (i.e. it can worsen the objective function), and # the smaller minibatch size will help to keep the update stable. 
- cur_minibatch_size = minibatch_size / 2 + cur_minibatch_size_str = common_train_lib.halve_minibatch_size_str(minibatch_size_str) cur_max_param_change = float(max_param_change) / math.sqrt(2) try: @@ -264,7 +264,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, left_context=left_context, right_context=right_context, momentum=momentum, max_param_change=cur_max_param_change, shuffle_buffer_size=shuffle_buffer_size, - minibatch_size=cur_minibatch_size, + minibatch_size_str=cur_minibatch_size_str, cache_read_opt=cache_read_opt, run_opts=run_opts, frames_per_eg=frames_per_eg, min_deriv_time=min_deriv_time, @@ -365,7 +365,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, - right_context, run_opts, mb_size=256, + right_context, run_opts, minibatch_size_str='256', wait=False, background_process_handler=None, get_raw_nnet_from_am=True): if get_raw_nnet_from_am: @@ -382,12 +382,12 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, nnet3-compute-prob "{model}" \ "ark,bg:nnet3-copy-egs {context_opts} \ ark:{egs_dir}/valid_diagnostic.egs ark:- | \ - nnet3-merge-egs --minibatch-size={mb_size} ark:- \ + nnet3-merge-egs --minibatch-size={minibatch_size_str} ark:- \ ark:- |" """.format(command=run_opts.command, dir=dir, iter=iter, context_opts=context_opts, - mb_size=mb_size, + minibatch_size_str=minibatch_size_str, model=model, egs_dir=egs_dir), wait=wait, background_process_handler=background_process_handler) @@ -397,19 +397,19 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, nnet3-compute-prob "{model}" \ "ark,bg:nnet3-copy-egs {context_opts} \ ark:{egs_dir}/train_diagnostic.egs ark:- | \ - nnet3-merge-egs --minibatch-size={mb_size} ark:- \ + nnet3-merge-egs --minibatch-size={minibatch_size_str} ark:- \ ark:- |" """.format(command=run_opts.command, dir=dir, iter=iter, context_opts=context_opts, - mb_size=mb_size, + minibatch_size_str=minibatch_size_str, model=model, egs_dir=egs_dir), wait=wait, background_process_handler=background_process_handler) def compute_progress(dir, iter, egs_dir, left_context, right_context, - run_opts, mb_size=256, + run_opts, minibatch_size_str=256, background_process_handler=None, wait=False, get_raw_nnet_from_am=True): if get_raw_nnet_from_am: @@ -429,13 +429,13 @@ def compute_progress(dir, iter, egs_dir, left_context, right_context, nnet3-show-progress --use-gpu=no "{prev_model}" "{model}" \ "ark,bg:nnet3-copy-egs {context_opts} \ ark:{egs_dir}/train_diagnostic.egs ark:- | \ - nnet3-merge-egs --minibatch-size={mb_size} ark:- \ + nnet3-merge-egs --minibatch-size={minibatch_size_str} ark:- \ ark:- |" """.format(command=run_opts.command, dir=dir, iter=iter, model=model, context_opts=context_opts, - mb_size=mb_size, + minibatch_size_str=minibatch_size_str, prev_model=prev_model, egs_dir=egs_dir), wait=wait, background_process_handler=background_process_handler) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 7a34df6f587..c25c484fb8d 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -115,10 +115,14 @@ def get_args(): [input] frames per job. This option is passed to get_egs.sh. 
Aim for about a minute of training time""") - parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, - dest='num_chunk_per_minibatch', default=512, - help="Number of sequences to be processed in parallel " - "every minibatch") + + parser.add_argument("--trainer.num-chunk-per-minibatch", type=str, + dest='num_chunk_per_minibatch', default='128', + help="""Number of sequences to be processed in + parallel every minibatch. May be a more general + rule as accepted by the --minibatch-size option of + nnet3-merge-egs; run that program without args to see + the format.""") # Parameters for the optimization parser.add_argument("--trainer.optimization.initial-effective-lrate", @@ -190,6 +194,9 @@ def process_args(args): if not common_train_lib.validate_chunk_width(args.chunk_width): raise Exception("--egs.chunk-width has an invalid value"); + if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): + raise Exception("--trainer.num-chunk-per-minibatch has an invalid value"); + if args.chunk_left_context < 0: raise Exception("--egs.chunk-left-context should be non-negative") @@ -440,7 +447,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): float(num_archives_processed) / num_archives_to_process, iter), shrinkage_value=shrinkage_value, - num_chunk_per_minibatch=args.num_chunk_per_minibatch, + num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, num_hidden_layers=num_hidden_layers, add_layers_period=args.add_layers_period, left_context=left_context, @@ -485,7 +492,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): chain_lib.combine_models( dir=args.dir, num_iters=num_iters, models_to_combine=models_to_combine, - num_chunk_per_minibatch=args.num_chunk_per_minibatch, + num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, egs_dir=egs_dir, left_context=left_context, right_context=right_context, leaky_hmm_coefficient=args.leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 86efa2fdb91..3270d114503 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -26,7 +26,7 @@ logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) @@ -66,9 +66,12 @@ def get_args(): # Parameters for the optimization parser.add_argument("--trainer.optimization.minibatch-size", - type=float, dest='minibatch_size', default=512, - help="Size of the minibatch used to compute the " - "gradient") + type=str, dest='minibatch_size', default='512', + help="""Size of the minibatch used in SGD training + (argument to nnet3-merge-egs); may be a more general + rule as accepted by the --minibatch-size option of + nnet3-merge-egs; run that program without args to see + the format.""") # General options parser.add_argument("--feat-dir", type=str, required=True, @@ -100,6 +103,9 @@ def process_args(args): if args.frames_per_eg < 1: raise Exception("--egs.frames-per-eg should have a minimum value of 1") + if not common_train_lib.validate_minibatch_size_str(args.minibatch_size): + raise Exception("--trainer.rnn.num-chunk-per-minibatch has an invalid value"); + if (not os.path.exists(args.dir) or not os.path.exists(args.dir+"/configs")): raise Exception("This scripts expects {0} to 
exist and have a configs " @@ -297,6 +303,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): * float(iter) / num_iters) if args.stage <= iter: + logger.info("On iteration {0}, learning rate is {1}.".format( + iter, learning_rate(iter, current_num_jobs, + num_archives_processed))) + train_lib.common.train_one_iteration( dir=args.dir, iter=iter, @@ -307,11 +317,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): num_archives=num_archives, learning_rate=learning_rate(iter, current_num_jobs, num_archives_processed), - dropout_edit_string=common_train_lib.get_dropout_edit_string( - args.dropout_schedule, - float(num_archives_processed) / num_archives_to_process, - iter), - minibatch_size=args.minibatch_size, + minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, num_hidden_layers=num_hidden_layers, add_layers_period=args.add_layers_period, @@ -335,7 +341,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if iter % reporting_iter_interval == 0: # lets do some reporting [report, times, data] = ( - nnet3_log_parse.generate_acc_logprob_report(args.dir)) + nnet3_log_parse.generate_accuracy_report(args.dir)) message = report subject = ("Update : Expt {dir} : " "Iter {iter}".format(dir=args.dir, iter=iter)) @@ -384,7 +390,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): remove_egs=remove_egs) # do some reporting - [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(args.dir) + [report, times, data] = nnet3_log_parse.generate_accuracy_report(args.dir) if args.email is not None: common_lib.send_mail(report, "Update : Expt {0} : " "complete".format(args.dir), args.email) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 14385e2cf2c..b853d77cb27 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -26,7 +26,7 @@ logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) @@ -65,9 +65,12 @@ def get_args(): # Parameters for the optimization parser.add_argument("--trainer.optimization.minibatch-size", - type=float, dest='minibatch_size', default=512, - help="Size of the minibatch used to compute the " - "gradient") + type=str, dest='minibatch_size', default='512', + help="""Size of the minibatch used in SGD training + (argument to nnet3-merge-egs); may be a more general + rule as accepted by the --minibatch-size option of + nnet3-merge-egs; run that program without args to see + the format.""") # General options parser.add_argument("--nj", type=int, default=4, @@ -102,6 +105,9 @@ def process_args(args): if args.frames_per_eg < 1: raise Exception("--egs.frames-per-eg should have a minimum value of 1") + if not common_train_lib.validate_minibatch_size_str(args.minibatch_size): + raise Exception("--trainer.optimization.minibatch-size has an invalid value"); + if (not os.path.exists(args.dir) or not os.path.exists(args.dir+"/configs")): raise Exception("This scripts expects {0} to exist and have a configs " @@ -300,6 +306,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): * float(iter) / num_iters) if args.stage <= iter: + logger.info("On iteration {0}, learning rate is {1}.".format( + iter, 
learning_rate(iter, current_num_jobs, + num_archives_processed))) + train_lib.common.train_one_iteration( dir=args.dir, iter=iter, @@ -310,11 +320,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): num_archives=num_archives, learning_rate=learning_rate(iter, current_num_jobs, num_archives_processed), - dropout_edit_string=common_train_lib.get_dropout_edit_string( - args.dropout_schedule, - float(num_archives_processed) / num_archives_to_process, - iter), - minibatch_size=args.minibatch_size, + minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, num_hidden_layers=num_hidden_layers, add_layers_period=args.add_layers_period, @@ -340,7 +346,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if iter % reporting_iter_interval == 0: # lets do some reporting [report, times, data] = ( - nnet3_log_parse.generate_acc_logprob_report(args.dir)) + nnet3_log_parse.generate_accuracy_report(args.dir)) message = report subject = ("Update : Expt {dir} : " "Iter {iter}".format(dir=args.dir, iter=iter)) @@ -384,7 +390,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): get_raw_nnet_from_am=False) # do some reporting - [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(args.dir) + [report, times, data] = nnet3_log_parse.generate_accuracy_report(args.dir) if args.email is not None: common_lib.send_mail(report, "Update : Expt {0} : " "complete".format(args.dir), args.email) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index cf48b300575..e65be7a443a 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -112,17 +112,21 @@ def get_args(): steps/nnet3/get_saturation.pl) exceeds this threshold we scale the parameter matrices with the shrink-value.""") - parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, - dest='cv_minibatch_size', default=256, + parser.add_argument("--trainer.optimization.cv-minibatch-size", type=str, + dest='cv_minibatch_size', default='256', help="""Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory - usage)""") - + usage). May be a more general rule as accepted by the + --minibatch-size option of nnet3-merge-egs; run that + program without args to see the format.""") # RNN specific trainer options - parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, - dest='num_chunk_per_minibatch', default=100, - help="Number of sequences to be processed in " - "parallel every minibatch") + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=str, + dest='num_chunk_per_minibatch', default='100', + help="""Number of sequences to be processed in + parallel every minibatch. 
May be a more general + rule as accepted by the --minibatch-size option of + nnet3-merge-egs; run that program without args to see + the format.""") parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', default=8, help="""Margin (in input frames) around the 'required' @@ -164,6 +168,12 @@ def process_args(args): if not common_train_lib.validate_chunk_width(args.chunk_width): raise Exception("--egs.chunk-width has an invalid value"); + if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): + raise Exception("--trainer.rnn.num-chunk-per-minibatch has an invalid value"); + + if not common_train_lib.validate_minibatch_size_str(args.cv_minibatch_size): + raise Exception("--trainer.optimization.cv-minibatch-size has an invalid value"); + if args.chunk_left_context < 0: raise Exception("--egs.chunk-left-context should be non-negative") @@ -404,7 +414,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): float(num_archives_processed) / num_archives_to_process, iter), shrinkage_value=shrinkage_value, - minibatch_size=args.num_chunk_per_minibatch, + minibatch_size_str=args.num_chunk_per_minibatch, num_hidden_layers=num_hidden_layers, add_layers_period=args.add_layers_period, left_context=left_context, @@ -414,7 +424,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, - cv_minibatch_size=args.cv_minibatch_size, + cv_minibatch_size_str=args.cv_minibatch_size, run_opts=run_opts, get_raw_nnet_from_am=False, background_process_handler=background_process_handler) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 8a61d21f340..27ab8c68b14 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -111,17 +111,21 @@ def get_args(): steps/nnet3/get_saturation.pl) exceeds this threshold we scale the parameter matrices with the shrink-value.""") - parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, - dest='cv_minibatch_size', default=256, + parser.add_argument("--trainer.optimization.cv-minibatch-size", type=str, + dest='cv_minibatch_size', default='256', help="""Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory - usage)""") - + usage). May be a more general rule as accepted by the + --minibatch-size option of nnet3-merge-egs; run that + program without args to see the format.""") # RNN specific trainer options - parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, - dest='num_chunk_per_minibatch', default=100, - help="Number of sequences to be processed in " - "parallel every minibatch") + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=str, + dest='num_chunk_per_minibatch', default='100', + help="""Number of sequences to be processed in + parallel every minibatch. 
May be a more general + rule as accepted by the --minibatch-size option of + nnet3-merge-egs; run that program without args to see + the format.""") parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', default=8, help="""Margin (in input frames) around the 'required' @@ -160,6 +164,12 @@ def process_args(args): if not common_train_lib.validate_chunk_width(args.chunk_width): raise Exception("--egs.chunk-width has an invalid value"); + if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): + raise Exception("--trainer.rnn.num-chunk-per-minibatch has an invalid value"); + + if not common_train_lib.validate_minibatch_size_str(args.cv_minibatch_size): + raise Exception("--trainer.optimization.cv-minibatch-size has an invalid value"); + if args.chunk_left_context < 0: raise Exception("--egs.chunk-left-context should be non-negative") @@ -397,7 +407,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): float(num_archives_processed) / num_archives_to_process, iter), shrinkage_value=shrinkage_value, - minibatch_size=args.num_chunk_per_minibatch, + minibatch_size_str=args.num_chunk_per_minibatch, num_hidden_layers=num_hidden_layers, add_layers_period=args.add_layers_period, left_context=left_context, @@ -407,7 +417,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, - cv_minibatch_size=args.cv_minibatch_size, + cv_minibatch_size_str=args.cv_minibatch_size, run_opts=run_opts, background_process_handler=background_process_handler) From 72cb32a592709d4f12a02ae4f6149d4bd0eb9d61 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Fri, 30 Dec 2016 13:54:38 -0800 Subject: [PATCH 038/213] Reverting a couple previous changes to local scripts which turned out to be unnecessary --- egs/ami/s5b/local/nnet3/run_lstm.sh | 1 + egs/hkust/s5/local/nnet3/run_lstm.sh | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/egs/ami/s5b/local/nnet3/run_lstm.sh b/egs/ami/s5b/local/nnet3/run_lstm.sh index ef5bfb36259..c5583e2d0ef 100755 --- a/egs/ami/s5b/local/nnet3/run_lstm.sh +++ b/egs/ami/s5b/local/nnet3/run_lstm.sh @@ -195,6 +195,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.shrink-value 0.99 \ --trainer.rnn.num-chunk-per-minibatch=$num_chunk_per_minibatch \ --trainer.optimization.momentum=$momentum \ + --trainer.rnn.num-bptt-steps 30 \ --egs.chunk-width=$chunk_width \ --egs.chunk-left-context=$chunk_left_context \ --egs.chunk-right-context=$chunk_right_context \ diff --git a/egs/hkust/s5/local/nnet3/run_lstm.sh b/egs/hkust/s5/local/nnet3/run_lstm.sh index f79ad02b543..7529147c948 100755 --- a/egs/hkust/s5/local/nnet3/run_lstm.sh +++ b/egs/hkust/s5/local/nnet3/run_lstm.sh @@ -42,6 +42,7 @@ shrink=0.98 momentum=0.5 adaptive_shrink=true num_chunk_per_minibatch=100 +num_bptt_steps=20 samples_per_iter=20000 remove_egs=true @@ -59,8 +60,8 @@ frames_per_chunk= . ./utils/parse_options.sh if ! 
cuda-compiled; then - cat < Date: Fri, 30 Dec 2016 17:09:37 -0800 Subject: [PATCH 039/213] Change nnet3 python scripts to support {left-right}-context-{initial,final} options --- .../nnet3/train/chain_objf/acoustic_model.py | 11 ++- egs/wsj/s5/steps/libs/nnet3/train/common.py | 67 ++++++++++++++++--- .../train/frame_level_objf/acoustic_model.py | 11 ++- .../nnet3/train/frame_level_objf/raw_model.py | 11 ++- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 17 ++++- egs/wsj/s5/steps/nnet3/chain/train.py | 18 ++++- egs/wsj/s5/steps/nnet3/get_egs.sh | 17 ++++- .../s5/steps/nnet3/get_egs_discriminative.sh | 24 +++++++ egs/wsj/s5/steps/nnet3/get_egs_targets.sh | 15 ++++- egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 2 + egs/wsj/s5/steps/nnet3/make_jesus_configs.py | 2 + egs/wsj/s5/steps/nnet3/make_tdnn_configs.py | 3 +- egs/wsj/s5/steps/nnet3/tdnn/make_configs.py | 2 + egs/wsj/s5/steps/nnet3/train_dnn.py | 9 ++- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 6 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 16 +++-- egs/wsj/s5/steps/nnet3/train_rnn.py | 22 +++--- 17 files changed, 203 insertions(+), 50 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index f55a647f5e0..f2510ed8e18 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -53,6 +53,7 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, left_context, right_context, run_opts, stage=0, left_tolerance=None, right_tolerance=None, + left_context_initial=-1, right_context_final=-1, frame_subsampling_factor=3, alignment_subsampling_factor=3, feat_type='raw', online_ivector_dir=None, @@ -70,7 +71,10 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, --feat-type {feat_type} \ --transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ - --left-context {left_context} --right-context {right_context} \ + --left-context {left_context} \ + --right-context {right_context} \ + --left-context-initial {left_context_initial} \ + --right-context-final {right_context_final} \ --left-tolerance '{left_tolerance}' \ --right-tolerance '{right_tolerance}' \ --frame-subsampling-factor {frame_subsampling_factor} \ @@ -89,7 +93,10 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ''), - left_context=left_context, right_context=right_context, + left_context=left_context, + right_context=right_context, + left_context_initial=left_context_initial, + right_context_final=right_context_final, left_tolerance=(left_tolerance if left_tolerance is not None else ''), diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index b064f517fc0..60bd2e69d5a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -281,7 +281,8 @@ def parse_generic_config_vars_file(var_file): def verify_egs_dir(egs_dir, feat_dim, ivector_dim, - left_context, right_context): + left_context, right_context, + left_context_initial=-1, right_context_final=-1): try: egs_feat_dim = int(open('{0}/info/feat_dim'.format( egs_dir)).readline()) @@ -291,6 +292,17 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, egs_dir)).readline()) egs_right_context = int(open('{0}/info/right_context'.format( egs_dir)).readline()) + try: + egs_left_context_initial = int(open('{0}/info/left_context_initial'.format( + egs_dir)).readline()) + 
except: # older scripts didn't write this, treat it as -1 in that case. + egs_left_context_initial = -1 + try: + egs_right_context_final = int(open('{0}/info/right_context_final'.format( + egs_dir)).readline()) + except: # older scripts didn't write this, treat it as -1 in that case. + egs_right_context_final = -1 + if (feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim): raise Exception("There is mismatch between featdim/ivector_dim of " "the current experiment and the provided " @@ -298,7 +310,26 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, if (egs_left_context < left_context or egs_right_context < right_context): - raise Exception('The egs have insufficient context') + raise Exception('The egs have insufficient (l,r) context ({0},{1}) ' + 'versus expected ({2},{3})'.format( + egs_left_context, egs_right_context, + left_context, right_context)) + + # the condition on the initial/final context is an equality condition, + # not an inequality condition, as there is no mechanism to 'correct' the + # context (by subtracting context) while copying the egs, like there is + # for the regular left-right context. If the user is determined to use + # previously dumped egs, they may be able to slightly adjust the + # --egs.chunk-left-context-initial and --egs.chunk-right-context-final + # options to make things matched up. [note: the model l/r context gets + # added in, so you have to correct for changes in that.] + if (egs_left_context_initial != left_context_initial or + egs_right_context_final != right_context_final): + raise Exception('The egs have incorrect initial/final (l,r) context ' + '({0},{1}) versus expected ({2},{3}). See code from ' + 'where this exception was raised for more info'.format( + egs_left_context_initial, egs_right_context_final, + left_context_initial, right_context_final)) frames_per_eg_str = open('{0}/info/frames_per_eg'.format( egs_dir)).readline().rstrip() @@ -512,9 +543,10 @@ class CommonParser: in steps/nnet3/train*.py and steps/nnet3/chain/train.py """ - parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False, + default_chunk_left_context=0) - def __init__(self): + def __init__(self, include_chunk_context = True): # feat options self.parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', default=None, @@ -527,22 +559,39 @@ def __init__(self): help="A string specifying '--norm-means' " "and '--norm-vars' values") - # egs extraction options - self.parser.add_argument("--egs.chunk-left-context", type=int, - dest='chunk_left_context', default=0, - help="""Number of additional frames of input + # egs extraction options. there is no point adding the chunk context + # option for non-RNNs (by which we mean basic TDNN-type topologies), as + # it wouldn't affect anything, so we disable them if we know in advance + # that we're not supporting RNN-type topologies (as in train_dnn.py). + if include_chunk_context: + self.parser.add_argument("--egs.chunk-left-context", type=int, + dest='chunk_left_context', + default=default_chunk_left_context, + help="""Number of additional frames of input to the left of the input chunk. This extra context will be used in the estimation of RNN state before prediction of the first label. 
In the case of FF-DNN this extra context will be used to allow for frame-shifts""") - self.parser.add_argument("--egs.chunk-right-context", type=int, + self.parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', default=0, help="""Number of additional frames of input to the right of the input chunk. This extra context will be used in the estimation of bidirectional RNN state before prediction of the first label.""") + self.parser.add_argument("--egs.chunk-left-context-initial", type=int, + dest='chunk_left_context_initial', default=-1, + help="""Number of additional frames of input + to the left of the *first* input chunk extracted + from an utterance. If negative, defaults to + the same as --egs.chunk-left-context""") + self.parser.add_argument("--egs.chunk-right-context-final", type=int, + dest='chunk_right_context_final', default=-1, + help="""Number of additional frames of input + to the right of the *last* input chunk extracted + from an utterance. If negative, defaults to the + same as --egs.chunk-right-context""") self.parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', default=None, action=common_lib.NullstrToNoneAction, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index ca3d36082fb..47265a19dba 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -21,6 +21,7 @@ def generate_egs(data, alidir, egs_dir, left_context, right_context, run_opts, stage=0, + left_context_initial=-1, right_context_final=-1, feat_type='raw', online_ivector_dir=None, samples_per_iter=20000, frames_per_eg_str="20", srand=0, egs_opts=None, cmvn_opts=None, transform_dir=None): @@ -38,7 +39,10 @@ def generate_egs(data, alidir, egs_dir, --feat-type {feat_type} \ --transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ - --left-context {left_context} --right-context {right_context} \ + --left-context {left_context} \ + --right-context {right_context} \ + --left-context-initial {left_context_initial} \ + --right-context-final {right_context_final} \ --stage {stage} \ --samples-per-iter {samples_per_iter} \ --frames-per-eg {frames_per_eg_str} \ @@ -53,7 +57,10 @@ def generate_egs(data, alidir, egs_dir, ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ''), - left_context=left_context, right_context=right_context, + left_context=left_context, + right_context=right_context, + left_context_initial=left_context_initial, + right_context_final=right_context_final, stage=stage, samples_per_iter=samples_per_iter, frames_per_eg_str=frames_per_eg_str, srand=srand, data=data, alidir=alidir, egs_dir=egs_dir, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py index 3f6e85e3644..037abeb1dd8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -20,6 +20,7 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, left_context, right_context, run_opts, stage=0, + left_context_initial=-1, right_context_final=-1, feat_type='raw', online_ivector_dir=None, target_type='dense', num_targets=-1, samples_per_iter=20000, frames_per_eg_str="20", @@ -54,7 +55,10 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, --feat-type {feat_type} \ 
--transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ - --left-context {left_context} --right-context {right_context} \ + --left-context {left_context} \ + --right-context {right_context} \ + --left-context-initial {left_context_initial} \ + --right-context-final {right_context_final} \ --stage {stage} \ --samples-per-iter {samples_per_iter} \ --frames-per-eg {frames_per_eg_str} \ @@ -71,7 +75,10 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ''), - left_context=left_context, right_context=right_context, + left_context=left_context, + right_context=right_context, + left_context_initial=left_context_initial, + right_context_final=right_context_final, stage=stage, samples_per_iter=samples_per_iter, frames_per_eg_str=frames_per_eg_str, srand=srand, num_targets=num_targets, diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 9018c2e2472..94bf322a514 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -32,6 +32,8 @@ alignment_subsampling_factor=3 # frames-per-second of input alignments divided left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. +left_context_initial=-1 # if >=0, left-context for first chunk of an utterance +right_context_final=-1 # if >=0, right-context for last chunk of an utterance compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). @@ -90,8 +92,10 @@ if [ $# != 4 ]; then echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " echo " --frames-per-eg # number of supervised frames per eg on disk" echo " --frames-overlap-per-eg # number of supervised frames of overlap between egs" - echo " --left-context # Number of frames on left side to append for feature input" - echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" + echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" echo " --num-valid-egs-combine <#frames;10000> # Number of egss used in getting combination weights at the" echo " # very end." @@ -264,7 +268,9 @@ echo $egs_per_archive > $dir/info/egs_per_archive echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" - +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... 
and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi if [ -e $dir/storage ]; then @@ -288,6 +294,9 @@ fi egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" + chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" [ ! -z $right_tolerance ] && \ @@ -298,6 +307,8 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final if [ $stage -le 3 ]; then echo "$0: Getting validation and training subset examples." diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index c25c484fb8d..6f0bc9c5243 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -296,6 +296,10 @@ def train(args, run_opts, background_process_handler): left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context + left_context_initial = (args.chunk_left_context_initial + model_left_context if + args.chunk_left_context_initial >= 0 else -1) + right_context_final = (args.chunk_right_context_final + model_right_context if + args.chunk_right_context_final >= 0 else -1) # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. 
This first config just does any initial splicing that we do; @@ -319,8 +323,12 @@ def train(args, run_opts, background_process_handler): {dir}/init.raw""".format(command=run_opts.command, dir=args.dir)) - egs_left_context = left_context + args.frame_subsampling_factor/2 - egs_right_context = right_context + args.frame_subsampling_factor/2 + egs_left_context = left_context + args.frame_subsampling_factor / 2 + egs_right_context = right_context + args.frame_subsampling_factor / 2 + egs_left_context_initial = (left_context_initial + args.frame_subsampling_factor / 2 if + left_context_initial >= 0 else -1) + egs_right_context_final = (right_context_final + args.frame_subsampling_factor / 2 if + right_context_final >= 0 else -1) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: @@ -331,6 +339,8 @@ def train(args, run_opts, background_process_handler): lat_dir=args.lat_dir, egs_dir=default_egs_dir, left_context=egs_left_context, right_context=egs_right_context, + left_context_initial=egs_left_context_initial, + right_context_final=egs_right_context_final, run_opts=run_opts, left_tolerance=args.left_tolerance, right_tolerance=args.right_tolerance, @@ -353,7 +363,9 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, - egs_left_context, egs_right_context)) + egs_left_context, egs_right_context, + egs_left_context_initial, + egs_right_context_final) assert(args.chunk_width == frames_per_eg_str) num_archives_expanded = num_archives * args.frame_subsampling_factor diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index e442dce9032..6622f3632f7 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -24,6 +24,8 @@ frames_per_eg=8 # number of frames of labels per example. more->less disk spa left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. +left_context_initial=-1 # if >=0, left-context for first chunk of an utterance +right_context_final=-1 # if >=0, right-context for last chunk of an utterance compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). @@ -74,8 +76,10 @@ if [ $# != 3 ]; then echo " # frames-per-eg is the chunk size, to get variety of chunk" echo " # sizes). The first in the list is preferred and is used" echo " # when working out the number of archives etc." - echo " --left-context # Number of frames on left side to append for feature input" - echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" + echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics" echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the" echo " # very end." 
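(Editor's note, not part of the patch: a short worked example of how the new initial/final context options combine with the model context, following the logic added to egs/wsj/s5/steps/nnet3/chain/train.py above and checked by verify_egs_dir(); the numeric values are invented for illustration.)

# Editor's sketch -- illustrative values only.
model_left_context = 17            # from the model configs
chunk_left_context = 40            # --egs.chunk-left-context
chunk_left_context_initial = 0     # --egs.chunk-left-context-initial; >= 0 means
                                   # "use this for the first chunk of each utterance"
frame_subsampling_factor = 3

left_context = chunk_left_context + model_left_context                       # 57
left_context_initial = (chunk_left_context_initial + model_left_context
                        if chunk_left_context_initial >= 0 else -1)          # 17

# The egs get a little extra context so the data can later be frame-shifted
# modulo the frame-subsampling factor ('//' here is the integer division that
# the Python-2-era scripts write as '/'):
egs_left_context = left_context + frame_subsampling_factor // 2              # 58
egs_left_context_initial = (left_context_initial + frame_subsampling_factor // 2
                            if left_context_initial >= 0 else -1)            # 18

# When reusing a previously dumped egs dir, verify_egs_dir() accepts egs whose
# regular left/right context is >= what is needed, but the initial/final
# context must match exactly, since there is no mechanism to trim it while
# copying the egs.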
@@ -245,6 +249,9 @@ echo $egs_per_archive > $dir/info/egs_per_archive echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi @@ -265,9 +272,15 @@ if [ $stage -le 2 ]; then fi egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress --num-frames=$frames_per_eg" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final + + num_pdfs=$(tree-info --print-args=false $alidir/tree | grep num-pdfs | awk '{print $2}') if [ $stage -le 3 ]; then echo "$0: Getting validation and training subset examples." diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh index a8b6b3376b5..406b998fc71 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -20,6 +20,8 @@ frame_subsampling_factor=1 # ratio between input and output frame-rate of nnet. left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. +left_context_initial=-1 # if >=0, left-context for first chunk of an utterance +right_context_final=-1 # if >=0, right-context for last chunk of an utterance adjust_priors=true compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). @@ -75,6 +77,10 @@ if [ $# != 6 ]; then echo " # the middle." echo " --online-ivector-dir # Directory for online-estimated iVectors, used in the" echo " # online-neural-net setup." + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" + echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" exit 1; fi @@ -250,6 +256,9 @@ echo $egs_per_archive > $dir/info/egs_per_archive echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... 
and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi if [ -e $dir/storage ]; then @@ -274,21 +283,36 @@ fi splitter_opts="--supervision-splitter.determinize=$determinize --supervision-splitter.minimize=$minimize --supervision-splitter.remove_output_symbols=$remove_output_symbols --supervision-splitter.remove_epsilons=$remove_epsilons --supervision-splitter.collapse-transition-ids=$collapse_transition_ids --supervision-splitter.acoustic-scale=$acwt" + +# If frame_subsampling_factor > 0, we will later be shifting the egs slightly to +# the left or right as part of training, so we see (e.g.) all shifts of the data +# modulo 3... we need to extend the l/r context slightly to account for this, to +# ensure we see the entire context that the model requires. left_context=$[left_context+frame_subsampling_factor/2] right_context=$[right_context+frame_subsampling_factor/2] +[ $left_context_initial -ge 0 ] && left_context_initial=$[left_context_initial+frame_subsampling_factor/2] +[ $right_context_final -ge 0 ] && right_context_final=$[right_context_final+frame_subsampling_factor/2] egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --compress=$compress --frame-subsampling-factor=$frame_subsampling_factor $splitter_opts" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" + # don't do the overlap thing for the priors computation data-- but do use the # same num-frames for the eg, which would be much more efficient in case it's a # recurrent model and has a lot of frames of context. In any case we're not # doing SGD so there is no benefit in having short chunks. priors_egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --compress=$compress" +[ $left_context_initial -ge 0 ] && priors_egs_opts="$priors_egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && priors_egs_opts="$priors_egs_opts --right-context-final=$right_context_final" + supervision_all_opts="--frame-subsampling-factor=$frame_subsampling_factor" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final echo $frame_subsampling_factor > $dir/info/frame_subsampling_factor diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index eeac84db969..7bd8fa5f983 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -32,6 +32,8 @@ frames_per_eg=8 # number of frames of labels per example. more->less disk spa left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. +left_context_initial=-1 # if >=0, left-context for first chunk of an utterance +right_context_final=-1 # if >=0, right-context for last chunk of an utterance compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). num_utts_subset=300 # number of utterances in validation and training @@ -81,8 +83,10 @@ if [ $# != 3 ]; then echo " # frames-per-eg is the chunk size, to get variety of chunk" echo " # sizes). 
The first in the list is preferred and is used" echo " # when working out the number of archives etc." - echo " --left-context # Number of frames on left side to append for feature input" - echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" + echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics" echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the" echo " # very end." @@ -238,6 +242,9 @@ echo $egs_per_archive > $dir/info/egs_per_archive echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then + echo "$0: ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)" +fi @@ -252,9 +259,13 @@ if [ -e $dir/storage ]; then fi egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress --num-frames=$frames_per_eg" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context +echo $left_context_initial > $dir/info/left_context_initial +echo $right_context_final > $dir/info/right_context_final for n in `seq $nj`; do utils/filter_scp.pl $sdata/$n/utt2spk $targets_scp > $dir/targets.$n.scp diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 205b6034fad..b80a8d4045b 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +# This script is deprecated, please use ../xconfig_to_configs.py + from __future__ import print_function import os import argparse diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py index 7f3aba2328c..b442ce9715b 100755 --- a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +# This script is deprecated, please use ./xconfig_to_configs.py + # tdnn or RNN with 'jesus layer' # inputs to jesus layer: diff --git a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py index 471911906c5..162fda16d16 100644 --- a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py @@ -1,11 +1,12 @@ #!/usr/bin/env python +# This script is deprecated, please use ../xconfig_to_configs.py + # we're using python 3.x style print but want it to work in python 2.x, from __future__ import print_function import re, os, argparse, sys, math, warnings - parser = argparse.ArgumentParser(description="Writes config files and variables " "for TDNNs creation and training", epilog="See steps/nnet3/train_tdnn.sh for example."); diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py 
b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index 48c13a1236c..5445b16e165 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +# This script is deprecated, please use ../xconfig_to_configs.py + # we're using python 3.x style print but want it to work in python 2.x, from __future__ import print_function import os diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 3270d114503..b5ed26499a4 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -36,19 +36,18 @@ def get_args(): """ Get args from stdin. - We add compulsary arguments as named arguments for readability + We add compulsory arguments as named arguments for readability The common options are defined in the object libs.nnet3.train.common.CommonParser.parser. See steps/libs/nnet3/train/common.py """ - parser = argparse.ArgumentParser( description="""Trains a feed forward DNN acoustic model using the cross-entropy objective. DNNs include simple DNNs, TDNNs and CNNs.""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve', - parents=[common_train_lib.CommonParser().parser]) + parents=[common_train_lib.CommonParser(include_chunk_context = False).parser]) # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', @@ -191,8 +190,8 @@ def train(args, run_opts, background_process_handler): raise Exception("KeyError {0}: Variables need to be defined in " "{1}".format(str(e), '{0}/configs'.format(args.dir))) - left_context = args.chunk_left_context + model_left_context - right_context = args.chunk_right_context + model_right_context + left_context = model_left_context + right_context = model_right_context # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index b853d77cb27..a26e0aa75cf 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -47,7 +47,7 @@ def get_args(): DNNs include simple DNNs, TDNNs and CNNs.""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve', - parents=[common_train_lib.CommonParser().parser]) + parents=[common_train_lib.CommonParser(include_chunk_context = False).parser]) # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', @@ -184,8 +184,8 @@ def train(args, run_opts, background_process_handler): raise Exception("KeyError {0}: Variables need to be defined in " "{1}".format(str(e), '{0}/configs'.format(args.dir))) - left_context = args.chunk_left_context + model_left_context - right_context = args.chunk_right_context + model_right_context + left_context = model_left_context + right_context = model_right_context # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index e65be7a443a..c2a6028b930 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -55,7 +55,7 @@ def get_args(): 3. 
RNNs can also be trained with state preservation training""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve', - parents=[common_train_lib.CommonParser().parser]) + parents=[common_train_lib.CommonParser(default_chunk_left_context=40).parser]) # egs extraction options parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', @@ -65,11 +65,6 @@ def get_args(): should halve --trainer.samples-per-iter. May be a comma-separated list of alternatives: first width is the 'principal' chunk-width, used preferentially""") - parser.add_argument("--egs.chunk-left-context", type=int, - dest='chunk_left_context', default=40, - help="""Number of left steps used in the estimation of - LSTM state before prediction of the first label. - Overrides the default value in CommonParser""") # trainer options parser.add_argument("--trainer.samples-per-iter", type=int, @@ -258,6 +253,10 @@ def train(args, run_opts, background_process_handler): left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context + left_context_initial = (args.chunk_left_context_initial + model_left_context if + args.chunk_left_context_initial >= 0 else -1) + right_context_final = (args.chunk_right_context_final + model_right_context if + args.chunk_right_context_final >= 0 else -1) # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; @@ -298,7 +297,10 @@ def train(args, run_opts, background_process_handler): train_lib.raw_model.generate_egs_using_targets( data=args.feat_dir, targets_scp=args.targets_scp, egs_dir=default_egs_dir, - left_context=left_context, right_context=right_context, + left_context=left_context, + right_context=right_context, + left_context_initial=left_context_initial, + right_context_final=right_context_final, run_opts=run_opts, frames_per_eg_str=args.chunk_width, srand=args.srand, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 27ab8c68b14..422540aee35 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -56,7 +56,7 @@ def get_args(): 3. RNNs can also be trained with state preservation training""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve', - parents=[common_train_lib.CommonParser().parser]) + parents=[common_train_lib.CommonParser(default_chunk_left_context = 40).parser]) # egs extraction options parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width', @@ -66,11 +66,6 @@ def get_args(): should halve --trainer.samples-per-iter. 
May be a comma-separated list of alternatives: first width is the 'principal' chunk-width, used preferentially""") - parser.add_argument("--egs.chunk-left-context", type=int, - dest='chunk_left_context', default=40, - help="""Number of left steps used in the estimation of - LSTM state before prediction of the first label""") - parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each @@ -263,6 +258,10 @@ def train(args, run_opts, background_process_handler): left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context + left_context_initial = (args.chunk_left_context_initial + model_left_context if + args.chunk_left_context_initial >= 0 else -1) + right_context_final = (args.chunk_right_context_final + model_right_context if + args.chunk_right_context_final >= 0 else -1) # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; @@ -283,8 +282,12 @@ def train(args, run_opts, background_process_handler): logger.info("Generating egs") train_lib.acoustic_model.generate_egs( - data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir, - left_context=left_context, right_context=right_context, + data=args.feat_dir, alidir=args.ali_dir, + egs_dir=default_egs_dir, + left_context=left_context, + right_context=right_context, + left_context_initial=left_context_initial, + right_context_final=right_context_final, run_opts=run_opts, frames_per_eg_str=args.chunk_width, srand=args.srand, @@ -303,7 +306,8 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, - left_context, right_context)) + left_context, right_context, + left_context_initial, right_context_final)) if args.chunk_width != frames_per_eg_str: raise Exception("mismatch between --egs.chunk-width and the frames_per_eg " "in the egs dir {0} vs {1}".(args.chunk_width, frames_per_eg_str)) From 588bfd487ecfec4c2dd5c9103d217ef517b0fa5e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 31 Dec 2016 00:07:29 -0500 Subject: [PATCH 040/213] Add diagnostics to UtteranceSplitter; Various fixes. 
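Note on the context plumbing in the preceding patches: the new --left-context-initial and --right-context-final options consistently use -1 to mean "not set, fall back to the ordinary left/right-context", and the training scripts add the model's own context and a small frame-subsampling allowance before passing the values to the egs scripts. A minimal standalone sketch of that arithmetic follows; the variable names mirror the script variables, but this is an illustration only, not code taken from the patch.

    def effective_contexts(model_left, model_right,
                           chunk_left, chunk_right,
                           chunk_left_initial=-1, chunk_right_final=-1,
                           frame_subsampling_factor=1):
        # Ordinary context = chunk context requested by the user plus the
        # context the model itself needs to compute one output frame.
        left = chunk_left + model_left
        right = chunk_right + model_right
        # The 'initial'/'final' variants apply only to the first/last chunk
        # of an utterance; -1 means "unset, use the ordinary value".
        left_initial = (chunk_left_initial + model_left
                        if chunk_left_initial >= 0 else -1)
        right_final = (chunk_right_final + model_right
                       if chunk_right_final >= 0 else -1)
        # The egs get slightly more context so that frame-shifted copies of
        # the data (used with frame subsampling) still see enough input.
        extra = frame_subsampling_factor // 2
        egs_left, egs_right = left + extra, right + extra
        egs_left_initial = left_initial + extra if left_initial >= 0 else -1
        egs_right_final = right_final + extra if right_final >= 0 else -1
        return egs_left, egs_right, egs_left_initial, egs_right_final

    # Example: a model needing 16/12 frames of context, no extra chunk
    # context, frame_subsampling_factor=3:
    print(effective_contexts(16, 12, 0, 0, frame_subsampling_factor=3))
    # -> (17, 13, -1, -1)
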
--- egs/wsj/s5/steps/libs/nnet3/train/common.py | 7 +- egs/wsj/s5/steps/nnet3/chain/train.py | 2 +- src/chainbin/nnet3-chain-get-egs.cc | 53 ++--- src/nnet3/nnet-chain-example.h | 2 +- src/nnet3/nnet-discriminative-example.h | 2 +- src/nnet3/nnet-example-utils.cc | 75 +++++- src/nnet3/nnet-example-utils.h | 35 ++- src/nnet3bin/nnet3-discriminative-get-egs.cc | 48 ++-- src/nnet3bin/nnet3-get-egs-dense-targets.cc | 228 ++++++++++--------- src/nnet3bin/nnet3-get-egs.cc | 44 ++-- 10 files changed, 286 insertions(+), 210 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 60bd2e69d5a..1edaf3972cb 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -543,10 +543,11 @@ class CommonParser: in steps/nnet3/train*.py and steps/nnet3/chain/train.py """ - parser = argparse.ArgumentParser(add_help=False, - default_chunk_left_context=0) + parser = argparse.ArgumentParser(add_help=False) - def __init__(self, include_chunk_context = True): + def __init__(self, + include_chunk_context = True, + default_chunk_left_context=0): # feat options self.parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', default=None, diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6f0bc9c5243..42d302c34a0 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -365,7 +365,7 @@ def train(args, run_opts, background_process_handler): common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, egs_left_context, egs_right_context, egs_left_context_initial, - egs_right_context_final) + egs_right_context_final)) assert(args.chunk_width == frames_per_eg_str) num_archives_expanded = num_archives * args.frame_subsampling_factor diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 2a8f5a1c6ad..bf1e87d2452 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -45,20 +45,19 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const chain::Supervision &supervision, const std::string &utt_id, bool compress, - const UtteranceSplitter &utt_splitter, - int64 *num_frames_written, - int64 *num_egs_written, + UtteranceSplitter *utt_splitter, NnetChainExampleWriter *example_writer) { - bool ans = true; KALDI_ASSERT(supervision.num_sequences == 1); int32 num_input_frames = feats.NumRows(), num_output_frames = supervision.frames_per_sequence; - if (!utt_splitter.LengthsMatch(utt_id, num_input_frames, num_output_frames)) + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames)) return false; // LengthsMatch() will have printed a warning. 
std::vector chunks; + utt_splitter->GetChunksForUtterance(num_input_frames, &chunks); + if (chunks.empty()) { KALDI_WARN << "Not producing egs for utterance " << utt_id << " because it is too short: " @@ -66,11 +65,9 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, return false; } - int32 frame_subsampling_factor = utt_splitter.Config().frame_subsampling_factor; - - utt_splitter.GetChunksForUtterance(num_input_frames, &chunks); + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; - chain::SupervisionSplitter splitter(supervision); + chain::SupervisionSplitter sup_splitter(supervision); for (size_t c = 0; c < chunks.size(); c++) { ChunkTimeInfo &chunk = chunks[c]; @@ -79,9 +76,9 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; chain::Supervision supervision_part; - splitter.GetFrameRange(start_frame_subsampled, - num_frames_subsampled, - &supervision_part); + sup_splitter.GetFrameRange(start_frame_subsampled, + num_frames_subsampled, + &supervision_part); if (normalization_fst.NumStates() > 0 && !AddWeightToSupervisionFst(normalization_fst, @@ -91,7 +88,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, << (chunk.first_frame + chunk.num_frames) << ", FST was empty after composing with normalization FST. " << "This should be extremely rare (a few per corpus, at most)"; - ans = false; } int32 first_frame = 0; // we shift the time-indexes of all these parts so @@ -154,12 +150,9 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, std::string key = os.str(); // key is - - *num_frames_written += chunk.num_frames; - *num_egs_written += 1; - example_writer->Write(key, nnet_chain_eg); } - return ans; + return true; } } // namespace nnet2 @@ -256,8 +249,7 @@ int main(int argc, char *argv[]) { RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); - int32 num_done = 0, num_err = 0; - int64 num_frames_written = 0, num_egs_written = 0; + int32 num_err = 0; for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); @@ -290,23 +282,18 @@ int main(int argc, char *argv[]) { continue; } - if (ProcessFile(normalization_fst, feats, - online_ivector_feats, online_ivector_period, - supervision, key, compress, utt_splitter, - &num_frames_written, &num_egs_written, - &example_writer)) - num_done++; - else + if (!ProcessFile(normalization_fst, feats, + online_ivector_feats, online_ivector_period, + supervision, key, compress, + &utt_splitter, &example_writer)) num_err++; } } - - KALDI_LOG << "Finished generating nnet3-chain examples, " - << "successfully processed " << num_done - << " feature files, wrote " << num_egs_written << " examples, " - << " with " << num_frames_written << " frames in total; " - << num_err << " files had errors."; - return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. 
+ return utt_splitter.ExitStatus(); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 87b2de77897..24e68116193 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -268,7 +268,7 @@ class ChainExampleMerger { int32 num_egs_written_; const ExampleMergingConfig &config_; NnetChainExampleWriter *writer_; - ExampleSizeStats stats_; + ExampleMergingStats stats_; // Note: the "key" into the egs is the first element of the vector. typedef unordered_map 1) { + std::ostringstream os; + os << std::setprecision(4); + for (std::map::iterator iter = chunk_size_to_count_.begin(); + iter != chunk_size_to_count_.end(); ++iter) { + int32 chunk_size = iter->first, + num_frames = chunk_size * iter->second; + float percent_of_total = num_frames * 100.0 / total_frames_in_chunks_; + if (iter != chunk_size_to_count_.begin()) os << ", "; + os << chunk_size << " = " << percent_of_total << "%"; + } + KALDI_LOG << "Output frames are distributed among chunk-sizes as follows: " + << os.str(); + } +} + float UtteranceSplitter::DefaultDurationOfSplit( const std::vector &split) const { if (split.empty()) // not a valid split, but useful to handle this case. @@ -761,7 +795,7 @@ void UtteranceSplitter::GetGapSizes(int32 utterance_length, void UtteranceSplitter::GetChunksForUtterance( int32 utterance_length, - std::vector *chunk_info) const { + std::vector *chunk_info) { std::vector chunk_sizes; GetChunkSizesForUtterance(utterance_length, &chunk_sizes); std::vector gaps(chunk_sizes.size()); @@ -780,12 +814,39 @@ void UtteranceSplitter::GetChunksForUtterance( config_.right_context_final : config_.right_context); t += chunk_sizes[i]; } + AccStatsForUtterance(utterance_length, *chunk_info); // check that the end of the last chunk doesn't go more than // 'config_.frame_subsampling_factor - 1' frames past the end // of the utterance. That amount, we treat as rounding error. 
KALDI_ASSERT(t - utterance_length < config_.frame_subsampling_factor); } +void UtteranceSplitter::AccStatsForUtterance( + int32 utterance_length, + const std::vector &chunk_info) { + total_num_utterances_ += 1; + total_input_frames_ += utterance_length; + + for (size_t c = 0; c < chunk_info.size(); c++) { + int32 chunk_size = chunk_info[c].num_frames; + if (c > 0) { + int32 last_chunk_end = chunk_info[c-1].first_frame + + chunk_info[c-1].num_frames; + if (last_chunk_end > chunk_info[c].first_frame) + total_frames_overlap_ += last_chunk_end - chunk_info[c].first_frame; + } + std::map::iterator iter = chunk_size_to_count_.find( + chunk_size); + if (iter == chunk_size_to_count_.end()) + chunk_size_to_count_[chunk_size] = 1; + else + iter->second++; + total_num_chunks_ += 1; + total_frames_in_chunks_ += chunk_size; + } +} + + void UtteranceSplitter::SetOutputWeights( int32 utterance_length, std::vector *chunk_info) const { @@ -951,7 +1012,7 @@ int32 ExampleMergingConfig::MinibatchSize(int32 size_of_eg, } -void ExampleSizeStats::WroteExample(int32 example_size, +void ExampleMergingStats::WroteExample(int32 example_size, size_t structure_hash, int32 minibatch_size) { std::pair p(example_size, structure_hash); @@ -965,7 +1026,7 @@ void ExampleSizeStats::WroteExample(int32 example_size, iter->second += 1; } -void ExampleSizeStats::DiscardedExamples(int32 example_size, +void ExampleMergingStats::DiscardedExamples(int32 example_size, size_t structure_hash, int32 num_discarded) { std::pair p(example_size, structure_hash); @@ -973,12 +1034,12 @@ void ExampleSizeStats::DiscardedExamples(int32 example_size, } -void ExampleSizeStats::PrintStats() const { +void ExampleMergingStats::PrintStats() const { PrintAggregateStats(); PrintSpecificStats(); } -void ExampleSizeStats::PrintAggregateStats() const { +void ExampleMergingStats::PrintAggregateStats() const { // First print some aggregate stats. int64 num_distinct_egs_types = 0, // number of distinct types of input egs // (differing in size or structure). @@ -1042,7 +1103,7 @@ void ExampleSizeStats::PrintAggregateStats() const { KALDI_LOG << os.str(); } -void ExampleSizeStats::PrintSpecificStats() const { +void ExampleMergingStats::PrintSpecificStats() const { KALDI_LOG << "Merged specific eg types as follows [format: =" "{->,->.../d=}" ",={...},... (note,egs-size == number of input " diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 46d6906ff99..66624f69004 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -180,8 +180,9 @@ class UtteranceSplitter { // Given an utterance length, this function creates for you a list of chunks // into which to split the utterance. Note: this is partly random (will call // srand()). + // Accumulates some stats which will be printed out in the destructor. void GetChunksForUtterance(int32 utterance_length, - std::vector *chunk_info) const; + std::vector *chunk_info); // This function returns true if 'supervision_length' (e.g. the length of the @@ -194,6 +195,9 @@ class UtteranceSplitter { int32 utterance_length, int32 supervision_length) const; + ~UtteranceSplitter(); + + int32 ExitStatus() { return (total_frames_in_chunks_ > 0); } private: @@ -250,7 +254,6 @@ class UtteranceSplitter { const std::vector &chunk_sizes, std::vector *gap_sizes) const; - // this static function, used in GetGapSizes(), writes random values to a // vector 'vec' such the sum of those values equals n (n may be positive or // negative). 
It tries to make those values as similar as possible (they will @@ -270,9 +273,13 @@ class UtteranceSplitter { // This function is responsible for setting the 'output_weights' // members of the chunks. - void SetOutputWeights(int32 utterance_lengths, + void SetOutputWeights(int32 utterance_length, std::vector *chunk_info) const; + // Accumulate stats for diagnostics. + void AccStatsForUtterance(int32 utterance_length, + const std::vector &chunk_info); + const ExampleGenerationConfig &config_; @@ -295,6 +302,21 @@ class UtteranceSplitter { // chunks, and then add the subtracted number of copies of the primary // num-frames to the split. std::vector > > splits_for_length_; + + // Below are stats used for diagnostics. + int32 total_num_utterances_; // total input utterances. + int64 total_input_frames_; // total num-frames over all utterances (before + // splitting) + int64 total_frames_overlap_; // total number of frames that overlap between + // adjacent egs. + int64 total_num_chunks_; + int64 total_frames_in_chunks_; // total of chunk-size times count of that + // chunk. equals the num-frames in all the + // output chunks, added up. + std::map chunk_size_to_count_; // for each chunk size, gives + // the number of chunks with + // that size. + }; @@ -403,9 +425,8 @@ int32 GetNnetExampleSize(const NnetExample &a); /// statistics about how examples of different sizes (c.f. GetNnetExampleSize()) /// were merged into minibatches, and how many examples were left over and /// discarded. -class ExampleSizeStats { +class ExampleMergingStats { public: - /// Users call this function to inform this class that one minibatch has been /// written aggregating 'minibatch_size' separate examples of original size /// 'example_size' (e.g. as determined by GetNnetExampleSize(), but the caller @@ -475,7 +496,7 @@ class ExampleMerger { void Finish(); // returns a suitable exit status for a program. - bool ExitStatus() { return num_egs_written_ > 0; } + bool ExitStatus() { return num_egs_written_ > 0 ? 0 : 1; } ~ExampleMerger() { Finish(); }; private: @@ -487,7 +508,7 @@ class ExampleMerger { int32 num_egs_written_; const ExampleMergingConfig &config_; NnetExampleWriter *writer_; - ExampleSizeStats stats_; + ExampleMergingStats stats_; // Note: the "key" into the egs is the first element of the vector. typedef unordered_map, diff --git a/src/nnet3bin/nnet3-discriminative-get-egs.cc b/src/nnet3bin/nnet3-discriminative-get-egs.cc index 6055dc3d20c..070a88b331d 100644 --- a/src/nnet3bin/nnet3-discriminative-get-egs.cc +++ b/src/nnet3bin/nnet3-discriminative-get-egs.cc @@ -34,6 +34,8 @@ namespace nnet3 { // This function does all the processing for one utterance, and outputs the // examples to 'example_writer'. 
+// returns true if we got as far as calling GetChunksForUtterance() +// [in which case stats will be accumulated by class UtteranceSplitter] static bool ProcessFile(const discriminative::SplitDiscriminativeSupervisionOptions &config, const TransitionModel &tmodel, const MatrixBase &feats, @@ -42,28 +44,26 @@ static bool ProcessFile(const discriminative::SplitDiscriminativeSupervisionOpti const discriminative::DiscriminativeSupervision &supervision, const std::string &utt_id, bool compress, - const UtteranceSplitter &utt_splitter, - int64 *num_frames_written, - int64 *num_egs_written, + UtteranceSplitter *utt_splitter, NnetDiscriminativeExampleWriter *example_writer) { KALDI_ASSERT(supervision.num_sequences == 1); int32 num_input_frames = feats.NumRows(), num_output_frames = supervision.frames_per_sequence; - if (!utt_splitter.LengthsMatch(utt_id, num_input_frames, num_output_frames)) + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames)) return false; // LengthsMatch() will have printed a warning. std::vector chunks; + utt_splitter->GetChunksForUtterance(num_input_frames, &chunks); + if (chunks.empty()) { KALDI_WARN << "Not producing egs for utterance " << utt_id << " because it is too short: " << num_input_frames << " frames."; } - int32 frame_subsampling_factor = utt_splitter.Config().frame_subsampling_factor; - - utt_splitter.GetChunksForUtterance(num_input_frames, &chunks); + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; discriminative::DiscriminativeSupervisionSplitter splitter(config, tmodel, supervision); @@ -143,9 +143,6 @@ static bool ProcessFile(const discriminative::SplitDiscriminativeSupervisionOpti std::string key = os.str(); // key is - - *num_frames_written += chunk.num_frames; - *num_egs_written += 1; - example_writer->Write(key, nnet_discriminative_eg); } return true; @@ -187,6 +184,8 @@ int main(int argc, char *argv[]) { discriminative::SplitDiscriminativeSupervisionOptions splitter_config; ParseOptions po(usage); + + eg_config.Register(&po); po.Register("compress", &compress, "If true, write egs in " "compressed format (recommended)"); po.Register("ivectors", &online_ivector_rspecifier, "Alias for --online-ivectors " @@ -198,7 +197,7 @@ int main(int argc, char *argv[]) { "option"); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - eg_config.Register(&po); + ParseOptions splitter_opts("supervision-splitter", &po); splitter_config.Register(&splitter_opts); @@ -236,8 +235,7 @@ int main(int argc, char *argv[]) { RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); - int32 num_done = 0, num_err = 0; - int64 num_frames_written = 0, num_egs_written = 0; + int32 num_err = 0; for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); @@ -269,22 +267,18 @@ int main(int argc, char *argv[]) { num_err++; continue; } - - if (ProcessFile(splitter_config, tmodel, - feats, online_ivector_feats, online_ivector_period, - supervision, key, compress, utt_splitter, - &num_frames_written, &num_egs_written, - &example_writer)) num_done++; - else num_err++; + if (!ProcessFile(splitter_config, tmodel, + feats, online_ivector_feats, online_ivector_period, + supervision, key, compress, + &utt_splitter, &example_writer)) + num_err++; } } - - KALDI_LOG << "Finished generating nnet3-discriminative examples, " - << "successfully processed " << num_done - << " feature files, wrote " << num_egs_written 
<< " examples, " - << " with " << num_frames_written << " frames in total; " - << num_err << " files had errors."; - return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints diagnostics. + return utt_splitter.ExitStatus(); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; diff --git a/src/nnet3bin/nnet3-get-egs-dense-targets.cc b/src/nnet3bin/nnet3-get-egs-dense-targets.cc index 23bf8922a5b..ddcf5f23555 100644 --- a/src/nnet3bin/nnet3-get-egs-dense-targets.cc +++ b/src/nnet3bin/nnet3-get-egs-dense-targets.cc @@ -25,6 +25,7 @@ #include "hmm/transition-model.h" #include "hmm/posterior.h" #include "nnet3/nnet-example.h" +#include "nnet3/nnet-example-utils.h" namespace kaldi { namespace nnet3 { @@ -32,101 +33,121 @@ namespace nnet3 { static void ProcessFile(const MatrixBase &feats, const MatrixBase *ivector_feats, + int32 ivector_period, const MatrixBase &targets, const std::string &utt_id, bool compress, int32 num_targets, - int32 left_context, - int32 right_context, - int32 frames_per_eg, - int64 *num_frames_written, - int64 *num_egs_written, + UtteranceSplitter *utt_splitter, NnetExampleWriter *example_writer) { - KALDI_ASSERT(feats.NumRows() == static_cast(targets.NumRows())); - - for (int32 t = 0; t < feats.NumRows(); t += frames_per_eg) { - - // actual_frames_per_eg is the number of frames with actual targets. - // At the end of the file, we pad with the last frame repeated - // so that all examples have the same structure (prevents the need - // for recompilations). - // TODO: We might need to ignore the end of the file. - int32 actual_frames_per_eg = std::min(frames_per_eg, - feats.NumRows() - t); - - - int32 tot_frames = left_context + frames_per_eg + right_context; - - Matrix input_frames(tot_frames, feats.NumCols()); - - // Set up "input_frames". - for (int32 j = -left_context; j < frames_per_eg + right_context; j++) { - int32 t2 = j + t; + int32 num_input_frames = feats.NumRows(); + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, + targets.NumRows())) { + if (targets.NumRows() == 0) + return; + // normally we wouldn't process such an utterance but there may be + // situations when a small disagreement is acceptable. + KALDI_WARN << " .. processing this utterance anyway."; + } + KALDI_ASSERT(num_targets < 0 || targets.NumCols() == num_targets); + + std::vector chunks; + + utt_splitter->GetChunksForUtterance(num_input_frames, &chunks); + + if (chunks.empty()) { + KALDI_WARN << "Not producing egs for utterance " << utt_id + << " because it is too short: " + << num_input_frames << " frames."; + return; + } + + // 'frame_subsampling_factor' is not used in any recipes at the time of + // writing, this is being supported to unify the code with the 'chain' recipes + // and in case we need it for some reason in future. 
+ int32 frame_subsampling_factor = + utt_splitter->Config().frame_subsampling_factor; + + for (size_t c = 0; c < chunks.size(); c++) { + const ChunkTimeInfo &chunk = chunks[c]; + + int32 tot_input_frames = chunk.left_context + chunk.num_frames + + chunk.right_context; + + Matrix input_frames(tot_input_frames, feats.NumCols(), + kUndefined); + + int32 start_frame = chunk.first_frame - chunk.left_context; + for (int32 t = start_frame; t < start_frame + tot_input_frames; t++) { + int32 t2 = t; if (t2 < 0) t2 = 0; - if (t2 >= feats.NumRows()) t2 = feats.NumRows() - 1; + if (t2 >= num_input_frames) t2 = num_input_frames - 1; + int32 j = t - start_frame; SubVector src(feats, t2), - dest(input_frames, j + left_context); + dest(input_frames, j); dest.CopyFromVec(src); } NnetExample eg; - + // call the regular input "input". - eg.io.push_back(NnetIo("input", - left_context, - input_frames)); + eg.io.push_back(NnetIo("input", -chunk.left_context, input_frames)); - // if applicable, add the iVector feature. if (ivector_feats != NULL) { - // try to get closest frame to middle of window to get - // a representative iVector. - int32 closest_frame = t + (actual_frames_per_eg / 2); - KALDI_ASSERT(ivector_feats->NumRows() > 0); - if (closest_frame >= ivector_feats->NumRows()) - closest_frame = ivector_feats->NumRows() - 1; + // if applicable, add the iVector feature. + // choose iVector from a random frame in the chunk + int32 ivector_frame = RandInt(start_frame, + start_frame + num_input_frames - 1), + ivector_frame_subsampled = ivector_frame / ivector_period; + if (ivector_frame_subsampled < 0) + ivector_frame_subsampled = 0; + if (ivector_frame_subsampled >= ivector_feats->NumRows()) + ivector_frame_subsampled = ivector_feats->NumRows() - 1; Matrix ivector(1, ivector_feats->NumCols()); - ivector.Row(0).CopyFromVec(ivector_feats->Row(closest_frame)); + ivector.Row(0).CopyFromVec(ivector_feats->Row(ivector_frame_subsampled)); eg.io.push_back(NnetIo("ivector", 0, ivector)); } + // Note: chunk.first_frame and chunk.num_frames will both be + // multiples of frame_subsampling_factor. + // We expect frame_subsampling_factor to usually be 1 for now. + int32 start_frame_subsampled = chunk.first_frame / frame_subsampling_factor, + num_frames_subsampled = chunk.num_frames / frame_subsampling_factor; + + KALDI_ASSERT(start_frame_subsampled + num_frames_subsampled - 1 < + targets.NumRows()); + + // add the labels. 
- Matrix targets_dest(frames_per_eg, targets.NumCols()); - for (int32 i = 0; i < actual_frames_per_eg; i++) { + Matrix targets_part(num_frames_subsampled, targets.NumCols()); + for (int32 i = 0; i < num_frames_subsampled; i++) { // Copy the i^th row of the target matrix from the (t+i)^th row of the // input targets matrix - SubVector this_target_dest(targets_dest, i); - SubVector this_target_src(targets, t+i); - this_target_dest.CopyFromVec(this_target_src); - } - - // Copy the last frame's target to the padded frames - for (int32 i = actual_frames_per_eg; i < frames_per_eg; i++) { - // Copy the i^th row of the target matrix from the last row of the - // input targets matrix - KALDI_ASSERT(t + actual_frames_per_eg - 1 == feats.NumRows() - 1); - SubVector this_target_dest(targets_dest, i); - SubVector this_target_src(targets, t+actual_frames_per_eg-1); + int32 t = i + start_frame_subsampled; + if (t >= targets.NumRows()) + t = targets.NumRows() - 1; + SubVector this_target_dest(targets_part, i); + SubVector this_target_src(targets, t); this_target_dest.CopyFromVec(this_target_src); - } + } // push this created targets matrix into the eg - eg.io.push_back(NnetIo("output", 0, targets_dest)); - + eg.io.push_back(NnetIo("output", 0, targets_part)); + if (compress) eg.Compress(); - + std::ostringstream os; - os << utt_id << "-" << t; + os << utt_id << "-" << chunk.first_frame; std::string key = os.str(); // key is - - *num_frames_written += actual_frames_per_eg; - *num_egs_written += 1; - example_writer->Write(key, eg); } } + } // namespace nnet2 } // namespace kaldi @@ -152,29 +173,31 @@ int main(int argc, char *argv[]) { "--right-context=9 --num-frames=8 \"$feats\" \\\n" "\"ark:copy-matrix ark:exp/snrs/snr.1.ark ark:- |\"\n" " ark:- \n"; - + bool compress = true; - int32 num_targets = -1, left_context = 0, right_context = 0, - num_frames = 1, length_tolerance = 100; - - std::string ivector_rspecifier; - + int32 num_targets = -1, length_tolerance = 100, online_ivector_period = 1; + ExampleGenerationConfig eg_config; // controls num-frames, + // left/right-context, etc. 
+ + std::string online_ivector_rspecifier; ParseOptions po(usage); + + eg_config.Register(&po); po.Register("compress", &compress, "If true, write egs in " "compressed format."); - po.Register("num-targets", &num_targets, "Number of targets for the neural network"); - po.Register("left-context", &left_context, "Number of frames of left " - "context the neural net requires."); - po.Register("right-context", &right_context, "Number of frames of right " - "context the neural net requires."); - po.Register("num-frames", &num_frames, "Number of frames with labels " - "that each example contains."); - po.Register("ivectors", &ivector_rspecifier, "Rspecifier of ivector " - "features, as matrix."); + po.Register("num-targets", &num_targets, "Output dimension in egs, " + "only used to check targets have correct dim if supplied."); + po.Register("ivectors", &online_ivector_rspecifier, "Alias for " + "--online-ivectors option, for back compatibility"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " + "ivector features, as a matrix."); + po.Register("online-ivector-period", &online_ivector_period, "Number of " + "frames between iVectors in matrices supplied to the " + "--online-ivectors option"); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - + po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -182,6 +205,9 @@ int main(int argc, char *argv[]) { exit(1); } + eg_config.ComputeDerived(); + UtteranceSplitter utt_splitter(eg_config); + if (num_targets <= 0) KALDI_ERR << "--num-targets options is required."; @@ -193,11 +219,10 @@ int main(int argc, char *argv[]) { SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); RandomAccessBaseFloatMatrixReader matrix_reader(matrix_rspecifier); NnetExampleWriter example_writer(examples_wspecifier); - RandomAccessBaseFloatMatrixReader ivector_reader(ivector_rspecifier); - - int32 num_done = 0, num_err = 0; - int64 num_frames_written = 0, num_egs_written = 0; - + RandomAccessBaseFloatMatrixReader online_ivector_reader(online_ivector_rspecifier); + + int32 num_err = 0; + for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); const Matrix &feats = feat_reader.Value(); @@ -207,52 +232,47 @@ int main(int argc, char *argv[]) { } else { const Matrix &target_matrix = matrix_reader.Value(key); if (target_matrix.NumRows() != feats.NumRows()) { - KALDI_WARN << "Target matrix has wrong size " + KALDI_WARN << "Target matrix has wrong size " << target_matrix.NumRows() << " versus " << feats.NumRows(); num_err++; continue; } - const Matrix *ivector_feats = NULL; - if (!ivector_rspecifier.empty()) { - if (!ivector_reader.HasKey(key)) { + const Matrix *online_ivector_feats = NULL; + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(key)) { KALDI_WARN << "No iVectors for utterance " << key; num_err++; continue; } else { // this address will be valid until we call HasKey() or Value() // again. 
- ivector_feats = &(ivector_reader.Value(key)); + online_ivector_feats = &(online_ivector_reader.Value(key)); } } - if (ivector_feats != NULL && - (abs(feats.NumRows() - ivector_feats->NumRows()) > length_tolerance - || ivector_feats->NumRows() == 0)) { + if (online_ivector_feats != NULL && + (abs(feats.NumRows() - online_ivector_feats->NumRows()) > length_tolerance + || online_ivector_feats->NumRows() == 0)) { KALDI_WARN << "Length difference between feats " << feats.NumRows() - << " and iVectors " << ivector_feats->NumRows() + << " and iVectors " << online_ivector_feats->NumRows() << "exceeds tolerance " << length_tolerance; num_err++; continue; } - - ProcessFile(feats, ivector_feats, target_matrix, key, compress, - num_targets, left_context, right_context, num_frames, - &num_frames_written, &num_egs_written, - &example_writer); - num_done++; + + ProcessFile(feats, online_ivector_feats, online_ivector_period, + target_matrix, key, compress, num_targets, + &utt_splitter, &example_writer); } } - - KALDI_LOG << "Finished generating examples, " - << "successfully processed " << num_done - << " feature files, wrote " << num_egs_written << " examples, " - << " with " << num_frames_written << " egs in total; " - << num_err << " files had errors."; - return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. + return utt_splitter.ExitStatus(); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; } } - diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc index 6b9dacfa03d..562684c30ab 100644 --- a/src/nnet3bin/nnet3-get-egs.cc +++ b/src/nnet3bin/nnet3-get-egs.cc @@ -30,24 +30,24 @@ namespace kaldi { namespace nnet3 { -static void ProcessFile(const MatrixBase &feats, +static bool ProcessFile(const MatrixBase &feats, const MatrixBase *ivector_feats, int32 ivector_period, const Posterior &pdf_post, const std::string &utt_id, bool compress, int32 num_pdfs, - const UtteranceSplitter &utt_splitter, - int64 *num_frames_written, - int64 *num_egs_written, + UtteranceSplitter *utt_splitter, NnetExampleWriter *example_writer) { int32 num_input_frames = feats.NumRows(); - if (!utt_splitter.LengthsMatch(utt_id, num_input_frames, + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, static_cast(pdf_post.size()))) - return; // LengthsMatch() will have printed a warning. + return false; // LengthsMatch() will have printed a warning. std::vector chunks; + utt_splitter->GetChunksForUtterance(num_input_frames, &chunks); + if (chunks.empty()) { KALDI_WARN << "Not producing egs for utterance " << utt_id << " because it is too short: " @@ -58,9 +58,7 @@ static void ProcessFile(const MatrixBase &feats, // writing, this is being supported to unify the code with the 'chain' recipes // and in case we need it for some reason in future. 
int32 frame_subsampling_factor = - utt_splitter.Config().frame_subsampling_factor; - - utt_splitter.GetChunksForUtterance(num_input_frames, &chunks); + utt_splitter->Config().frame_subsampling_factor; for (size_t c = 0; c < chunks.size(); c++) { const ChunkTimeInfo &chunk = chunks[c]; @@ -136,11 +134,9 @@ static void ProcessFile(const MatrixBase &feats, std::string key = os.str(); // key is - - *num_frames_written += chunk.num_frames; - *num_egs_written += 1; - example_writer->Write(key, eg); } + return true; } } // namespace nnet3 @@ -222,8 +218,7 @@ int main(int argc, char *argv[]) { RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); - int32 num_done = 0, num_err = 0; - int64 num_frames_written = 0, num_egs_written = 0; + int32 num_err = 0; for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); @@ -263,20 +258,17 @@ int main(int argc, char *argv[]) { continue; } - ProcessFile(feats, online_ivector_feats, online_ivector_period, - pdf_post, key, compress, num_pdfs, utt_splitter, - &num_frames_written, &num_egs_written, - &example_writer); - num_done++; + if (!ProcessFile(feats, online_ivector_feats, online_ivector_period, + pdf_post, key, compress, num_pdfs, + &utt_splitter, &example_writer)) + num_err++; } } - - KALDI_LOG << "Finished generating examples, " - << "successfully processed " << num_done - << " feature files, wrote " << num_egs_written << " examples, " - << " with " << num_frames_written << " egs in total; " - << num_err << " files had errors."; - return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + if (num_err > 0) + KALDI_WARN << num_err << " utterances had errors and could " + "not be processed."; + // utt_splitter prints stats in its destructor. + return utt_splitter.ExitStatus(); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; From 00f4c41cf5dca1ba813312dfa036e0a89ea7ba91 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 1 Jan 2017 16:33:38 -0500 Subject: [PATCH 041/213] Various code and script fixes --- .../nnet3/train/chain_objf/acoustic_model.py | 6 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 93 ++++++++++++------- .../nnet3/train/frame_level_objf/common.py | 2 +- src/chainbin/nnet3-chain-copy-egs.cc | 33 +++++-- src/chainbin/nnet3-chain-merge-egs.cc | 10 +- src/nnet3/nnet-chain-example.cc | 5 +- src/nnet3/nnet-chain-example.h | 8 +- src/nnet3/nnet-chain-training.cc | 3 +- src/nnet3/nnet-component-itf.h | 2 +- src/nnet3/nnet-discriminative-example.cc | 6 +- src/nnet3/nnet-discriminative-example.h | 9 +- src/nnet3/nnet-example-utils.cc | 36 +++---- src/nnet3/nnet-example-utils.h | 31 ++++--- src/nnet3/nnet-nnet.h | 4 +- src/nnet3/nnet-training.cc | 16 ++-- src/nnet3/nnet-training.h | 5 +- src/nnet3bin/nnet3-copy-egs.cc | 28 +++--- .../nnet3-discriminative-merge-egs.cc | 12 +-- src/nnet3bin/nnet3-get-egs-dense-targets.cc | 3 - src/nnet3bin/nnet3-merge-egs.cc | 2 +- 20 files changed, 177 insertions(+), 137 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index f2510ed8e18..48fc119ee96 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -301,7 +301,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, # changing too fast (i.e. it can worsen the objective function), and # the smaller minibatch size will help to keep the update stable. 
cur_num_chunk_per_minibatch_str = common_train_lib.halve_minibatch_size_str( - cur_num_chunk_per_minibatch_str) + num_chunk_per_minibatch_str) cur_max_param_change = float(max_param_change) / math.sqrt(2) raw_model_string = raw_model_string + dropout_edit_string @@ -474,7 +474,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs --left-context={lc} \ --right-context={rc} ark:{egs_dir}/valid_diagnostic.cegs \ - ark:- | nnet3-chain-merge-egs ark:- ark:- |" \ + ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, lc=left_context, rc=right_context, l2=l2_regularize, leaky=leaky_hmm_coefficient, @@ -489,7 +489,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs --left-context={lc} \ --right-context={rc} ark:{egs_dir}/train_diagnostic.cegs \ - ark:- | nnet3-chain-merge-egs ark:- ark:- |" \ + ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, lc=left_context, rc=right_context, l2=l2_regularize, leaky=leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 1edaf3972cb..af8e9793f0a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -161,13 +161,42 @@ def principal_chunk_width(chunk_width): return int(chunk_width.split(",")[0]) +def validate_range_str(range_str): + """Helper function used inside validate_minibatch_size_str(). + Returns true if range_str is a a comma-separated list of + positive integers and ranges of integers, like '128', + '128,256', or '64-128,256'.""" + if not isinstance(range_str, str): + return False + ranges = range_str.split(",") + assert len(ranges) > 0 + for r in ranges: + # a range may be either e.g. '64', or '128-256' + try: + c = [ int(x) for x in r.split(":") ] + except: + return False + # c should be either e.g. [ 128 ], or [64,128]. + if len(c) == 1: + if c[0] <= 0: + return False + elif len(c) == 2: + if c[0] <= 0 or c[1] < c[0]: + return False + else: + return False + return True + + + def validate_minibatch_size_str(minibatch_size_str): """Validate a minibatch-size string (returns bool). - A minibatch-size string might either be an integer, like '256' - or a rule like '128=64-128/256=32,64', whose format + A minibatch-size string might either be an integer, like '256', + a comma-separated set of integers or ranges like '128,256' or + '64:128,256', or a rule like '128=64:128/256=32,64', whose format is: eg-length1=size-range1/eg-length2=size-range2/.... - where the size-range is a comma-separated list of either integers - or ranges. An arbitrary eg will be mapped to the size-range + where a size-range is a comma-separated list of either integers like '16' + or ranges like '16:32'. An arbitrary eg will be mapped to the size-range for the closest of the listed eg-lengths (the eg-length is defined as the number of input frames, including context frames).""" if not isinstance(minibatch_size_str, str): @@ -183,11 +212,7 @@ def validate_minibatch_size_str(minibatch_size_str): # one choice)... this would mean somebody just gave "25" # or something like that for the minibatch size. 
if len(a) == 1 and len(b) == 1: - try: - mb_size = int(b[0]) - return mb_size > 0 - except: - return False + return validate_range_str(elem) else: return False # check that the thing before the '=' sign is a positive integer @@ -197,26 +222,29 @@ def validate_minibatch_size_str(minibatch_size_str): return False except: return False # not an integer at all. - # check the thing after the '=' sign is a comma-separated list of ranges - ranges = b[1].split(",") - assert len(ranges) > 0 - for range in ranges: - # a range may be either e.g. '64', or '128-256' - try: - c = [ int(x) for x in range.split("-") ] - except: - return False - if len(c) == 1: - if c[0] <= 0: - return False - elif len(c) == 2: - if c[0] <= 0 or c[1] < c[0]: - return False - else: - return False + + if not validate_range_str(b[1]): + return False return True +def halve_range_str(range_str): + """Helper function used inside halve_minibatch_size_str(). + returns half of a range [but converting resulting zeros to + ones], e.g. '16'->'8', '16,32'->'8,16', '64:128'->'32:64'. + Returns true if range_str is a a comma-separated list of + positive integers and ranges of integers, like '128', + '128,256', or '64-128,256'.""" + + ranges = range_str.split(",") + halved_ranges = [] + for r in ranges: + # a range may be either e.g. '64', or '128:256' + c = [ str(max(1, int(x)/2)) for x in r.split(":") ] + halved_ranges.append(":".join(c)) + return ','.join(halved_ranges) + + def halve_minibatch_size_str(minibatch_size_str): """Halve a minibatch-size string, as would be validated by validate_minibatch_size_str (see docs for that). This halves @@ -232,16 +260,10 @@ def halve_minibatch_size_str(minibatch_size_str): b = elem.split('=') # We expect b to have length 2 in the normal case. if len(b) == 1: - mb_size = int(b[0]) - ans.append(str(max(1, mb_size / 2))) + return halve_range_str(elem) else: assert len(b) == 2 - ranges_out = [] - ranges = b[1].split(',') - for range in ranges: - c = [ str(max(1, int(x)/2)) for x in range.split('-') ] - ranges_out.append('-'.join(c)) - ans.append('{0}={1}'.format(b[0], ','.join(ranges_out))) + ans.append('{0}={1}'.format(b[0], halve_range_str(b[1]))) return '/'.join(ans) @@ -529,8 +551,9 @@ def remove_model(nnet_dir, iter, num_iters, models_to_combine=None, def self_test(): assert halve_minibatch_size_str('64') == '32' + assert halve_minibatch_size_str('64,16:32') == '32,8:16' assert halve_minibatch_size_str('1') == '1' - assert halve_minibatch_size_str('128=64/256=40,80-100') == '128=32/256=20,40-50' + assert halve_minibatch_size_str('128=64/256=40,80:100') == '128=32/256=20,40:50' assert validate_chunk_width('64') assert validate_chunk_width('64,25,128') diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index c1c95b4748f..0826c9f0468 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -365,7 +365,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, - right_context, run_opts, minibatch_size_str='256', + right_context, run_opts, minibatch_size_str='1:256', wait=False, background_process_handler=None, get_raw_nnet_from_am=True): if get_raw_nnet_from_am: diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index b0c963595a1..1396932252a 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ 
b/src/chainbin/nnet3-chain-copy-egs.cc @@ -201,6 +201,7 @@ void ModifyChainExampleContext(const NnetChainExample &eg, int32 right_context, const int32 frame_subsampling_factor, NnetChainExample *eg_out) { + static bool warned_left = false, warned_right = false; int32 min_input_t, max_input_t, min_output_t, max_output_t; if (!ContainsSingleExample(eg, &min_input_t, &max_input_t, @@ -208,19 +209,31 @@ void ModifyChainExampleContext(const NnetChainExample &eg, KALDI_ERR << "Too late to perform frame selection/context reduction on " << "these examples (already merged?)"; if (left_context != -1) { - if (min_input_t > min_output_t - left_context) - KALDI_ERR << "You requested --left-context=" << left_context - << ", but example only has left-context of " - << (min_output_t - min_input_t); + int32 observed_left_context = min_output_t - min_input_t; + if (!warned_left && observed_left_context < left_context) { + warned_left = true; + KALDI_WARN << "You requested --left-context=" << left_context + << ", but example only has left-context of " + << observed_left_context + << " (will warn only once; this may be harmless if " + "using any --*left-context-initial options)"; + } min_input_t = std::max(min_input_t, min_output_t - left_context); } if (right_context != -1) { - if (max_input_t < max_output_t + right_context + frame_subsampling_factor - 1) - KALDI_ERR << "You requested --right-context=" << right_context - << ", but example only has right-context of " - << (max_input_t - max_output_t - frame_subsampling_factor + 1); - max_input_t = std::min(max_input_t, max_output_t + right_context - + frame_subsampling_factor - 1); + int32 observed_right_context = max_input_t - max_output_t; + + if (right_context != -1) { + if (!warned_right && observed_right_context < right_context) { + warned_right = true; + KALDI_ERR << "You requested --right-context=" << right_context + << ", but example only has right-context of " + << observed_right_context + << " (will warn only once; this may be harmless if " + "using any --*right-context-final options."; + } + max_input_t = std::min(max_input_t, max_output_t + right_context); + } } FilterExample(eg, min_input_t, max_input_t, diff --git a/src/chainbin/nnet3-chain-merge-egs.cc b/src/chainbin/nnet3-chain-merge-egs.cc index 9c91f997e7a..82dee560ff4 100644 --- a/src/chainbin/nnet3-chain-merge-egs.cc +++ b/src/chainbin/nnet3-chain-merge-egs.cc @@ -41,10 +41,9 @@ int main(int argc, char *argv[]) { "nnet3-chain-merge-egs --minibatch-size=128 ark:1.cegs ark:- | nnet3-chain-train-simple ... \n" "See also nnet3-chain-copy-egs\n"; - ExampleMergingConfig merging_config; - merging_config.minibatch_size = 64; // change the default for this - // program.. anyway it will usually be - // set on the command line. + + ExampleMergingConfig merging_config("64"); // 64 is default minibatch size. 
+ ParseOptions po(usage); merging_config.Register(&po); @@ -61,8 +60,9 @@ int main(int argc, char *argv[]) { SequentialNnetChainExampleReader example_reader(examples_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); + merging_config.ComputeDerived(); ChainExampleMerger merger(merging_config, &example_writer); - while (!example_reader.Done()) { + for (; !example_reader.Done(); example_reader.Next()) { const NnetChainExample &cur_eg = example_reader.Value(); merger.AcceptExample(new NnetChainExample(cur_eg)); } diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index b1c6e60de47..4f9cb4b92b8 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -505,8 +505,8 @@ void ChainExampleMerger::AcceptExample(NnetChainExample *eg) { // so use swap to create that without doing any real work. std::vector egs_to_merge(minibatch_size); for (int32 i = 0; i < minibatch_size; i++) { - egs_to_merge[i].Swap(vec[i]); - delete vec[i]; // we owned those pointers. + egs_to_merge[i].Swap(vec_copy[i]); + delete vec_copy[i]; // we owned those pointers. } WriteMinibatch(&egs_to_merge); } @@ -572,6 +572,7 @@ void ChainExampleMerger::Finish() { vec.clear(); } } + stats_.PrintStats(); } diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 24e68116193..ac782a92805 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -247,14 +247,14 @@ class ChainExampleMerger { void AcceptExample(NnetChainExample *a); // This function announces to the class that the input has finished, so it - // should flush out any smaller-sizes minibatches, as dictated by the config. + // should flush out any smaller-sized minibatches, as dictated by the config. // This will be called in the destructor, but you can call it explicitly when - // all the input is done if you want to. - // It also prints the stats. + // all the input is done if you want to; it won't repeat anything if called + // twice. It also prints the stats. void Finish(); // returns a suitable exit status for a program. - bool ExitStatus() { return num_egs_written_ > 0; } + int32 ExitStatus() { Finish(); return (num_egs_written_ > 0 ? 0 : 1); } ~ChainExampleMerger() { Finish(); }; private: diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index bfc67db17be..1e293f588ae 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -30,7 +30,8 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, opts_(opts), den_graph_(den_fst, nnet->OutputDim("output")), nnet_(nnet), - compiler_(*nnet, opts_.nnet_config.optimize_config), + compiler_(*nnet, opts_.nnet_config.optimize_config, + opts_.nnet_config.compiler_config), num_minibatches_processed_(0) { if (opts.nnet_config.zero_component_stats) ZeroComponentStats(nnet); diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index e5974b46f46..9dc372340be 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -355,7 +355,7 @@ class RandomComponent: public Component { // This function is required in testing code and in other places we need // consistency in the random number generation (e.g. when optimizing // validation-set performance), but check where else we call srand(). You'll - // need to call srand as well as making this call. + // need to call srand prior to making this call. 
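  // (Illustrative usage sketch; 'rc' stands in for any RandomComponent
  //  pointer and 'seed' for whatever seed the caller chooses, neither is a
  //  name from this code:
  //    srand(seed);
  //    rc->ResetGenerator(); )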
void ResetGenerator() { random_generator_.SeedGpu(); } protected: CuRand random_generator_; diff --git a/src/nnet3/nnet-discriminative-example.cc b/src/nnet3/nnet-discriminative-example.cc index debc91b96c9..a7330e772a3 100644 --- a/src/nnet3/nnet-discriminative-example.cc +++ b/src/nnet3/nnet-discriminative-example.cc @@ -499,8 +499,8 @@ void DiscriminativeExampleMerger::AcceptExample(NnetDiscriminativeExample *eg) { // so use swap to create that without doing any real work. std::vector egs_to_merge(minibatch_size); for (int32 i = 0; i < minibatch_size; i++) { - egs_to_merge[i].Swap(vec[i]); - delete vec[i]; // we owned those pointers. + egs_to_merge[i].Swap(vec_copy[i]); + delete vec_copy[i]; // we owned those pointers. } WriteMinibatch(&egs_to_merge); } @@ -566,10 +566,10 @@ void DiscriminativeExampleMerger::Finish() { vec.clear(); } } + stats_.PrintStats(); } } // namespace nnet3 } // namespace kaldi - diff --git a/src/nnet3/nnet-discriminative-example.h b/src/nnet3/nnet-discriminative-example.h index 048ee32c4e8..9d9bba0c906 100644 --- a/src/nnet3/nnet-discriminative-example.h +++ b/src/nnet3/nnet-discriminative-example.h @@ -245,14 +245,14 @@ class DiscriminativeExampleMerger { void AcceptExample(NnetDiscriminativeExample *a); // This function announces to the class that the input has finished, so it - // should flush out any smaller-sizes minibatches, as dictated by the config. + // should flush out any smaller-sized minibatches, as dictated by the config. // This will be called in the destructor, but you can call it explicitly when - // all the input is done if you want to. - // It also prints the stats. + // all the input is done if you want to; it won't repeat anything if called + // twice. It also prints the stats. void Finish(); // returns a suitable exit status for a program. - bool ExitStatus() { return num_egs_written_ > 0; } + int32 ExitStatus() { Finish(); return (num_egs_written_ > 0 ? 0 : 1); } ~DiscriminativeExampleMerger() { Finish(); }; private: @@ -281,4 +281,3 @@ class DiscriminativeExampleMerger { } // namespace kaldi #endif // KALDI_NNET3_NNET_DISCRIMINATIVE_EXAMPLE_H_ - diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 574ff6fbfae..28578de42fb 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -466,10 +466,12 @@ void UtteranceSplitter::InitSplitForLength() { // gaps twice as strongly as overlaps, based on the intuition that // completely throwing out frames of data is worse than counting them // twice. - int32 c = (default_duration > float(u) ? default_duration - u : - 2 * (u - default_duration)); - if (u < max_chunk_size) + float c = (default_duration > float(u) ? 
default_duration - float(u) : + 2.0 * (u - default_duration)); + if (u < max_chunk_size) // can't fit the largest of the chunks in this + // utterance c = std::numeric_limits::max(); + KALDI_ASSERT(c >= 0); costs_for_length[u].push_back(c); } } @@ -574,6 +576,10 @@ void UtteranceSplitter::GetChunkSizesForUtterance( KALDI_ASSERT(utterance_length >= 0); const std::vector > &possible_splits = splits_for_length_[utterance_length]; + if (possible_splits.empty()) { + chunk_sizes->clear(); + return; + } int32 num_possible_splits = possible_splits.size(), randomly_chosen_split = RandInt(0, num_possible_splits - 1); *chunk_sizes = possible_splits[randomly_chosen_split]; @@ -693,7 +699,7 @@ void UtteranceSplitter::DistributeRandomly(int32 n, std::vector > partial_counts; int32 total_count = 0; for (int32 i = 0; i < size; i++) { - float this_count = float(n) / total_magnitude; + float this_count = n * float(magnitudes[i]) / total_magnitude; // note: cast of float to int32 rounds towards zero (down, in this // case, since this_count >= 0). int32 this_whole_count = static_cast(this_count), @@ -904,11 +910,9 @@ bool ExampleMergingConfig::ParseIntSet(const std::string &str, int_set->ranges.resize(split_str.size()); for (size_t i = 0; i < split_str.size(); i++) { std::vector split_range; - // note: because we split on '-', it't not possible to - // get negative values in 'split_range'. - SplitStringToIntegers(str, "-", false, &split_range); + SplitStringToIntegers(split_str[i], ":", false, &split_range); if (split_range.size() < 1 || split_range.size() > 2 || - split_range[0] > split_range[1]) + split_range[0] > split_range.back() || split_range[0] <= 0) return false; int_set->ranges[i].first = split_range[0]; int_set->ranges[i].second = split_range.back(); @@ -935,7 +939,7 @@ void ExampleMergingConfig::ComputeDerived() { rules.resize(minibatch_size_split.size()); for (size_t i = 0; i < minibatch_size_split.size(); i++) { - int32 &minibatch_size = rules[i].first; + int32 &eg_size = rules[i].first; IntSet &int_set = rules[i].second; // 'this_rule' will be either something like "256" or like "64-128,256" // (but these two only if minibatch_size_split.size() == 1, or something with @@ -948,7 +952,7 @@ void ExampleMergingConfig::ComputeDerived() { KALDI_ERR << "Could not parse option --minibatch-size=" << minibatch_size; } - if (!ConvertStringToInteger(rule_split[0], &minibatch_size) || + if (!ConvertStringToInteger(rule_split[0], &eg_size) || !ParseIntSet(rule_split[1], &int_set)) KALDI_ERR << "Could not parse option --minibatch-size=" << minibatch_size; @@ -957,9 +961,8 @@ void ExampleMergingConfig::ComputeDerived() { if (minibatch_size_split.size() != 1) { KALDI_ERR << "Could not parse option --minibatch-size=" << minibatch_size << " (all rules must have " - << "minibatch-size specified if >1 rule)"; + << "eg-size specified if >1 rule)"; } - minibatch_size = 0; if (!ParseIntSet(this_rule, &int_set)) KALDI_ERR << "Could not parse option --minibatch-size=" << minibatch_size; @@ -1035,8 +1038,8 @@ void ExampleMergingStats::DiscardedExamples(int32 example_size, void ExampleMergingStats::PrintStats() const { - PrintAggregateStats(); PrintSpecificStats(); + PrintAggregateStats(); } void ExampleMergingStats::PrintAggregateStats() const { @@ -1097,7 +1100,7 @@ void ExampleMergingStats::PrintAggregateStats() const { << " egs of avg. size " << avg_input_egs_size << " into " << num_minibatches << " minibatches, discarding " << percent_discarded << "% of egs. 
Avg minibatch size was " - << avg_minibatch_size << ", distinct types of egs/minibatches " + << avg_minibatch_size << ", #distinct types of egs/minibatches " << "was " << num_distinct_egs_types << "/" << num_distinct_minibatch_types; KALDI_LOG << os.str(); @@ -1179,8 +1182,8 @@ void ExampleMerger::AcceptExample(NnetExample *eg) { // so use swap to create that without doing any real work. std::vector egs_to_merge(minibatch_size); for (int32 i = 0; i < minibatch_size; i++) { - egs_to_merge[i].Swap(vec[i]); - delete vec[i]; // we owned those pointers. + egs_to_merge[i].Swap(vec_copy[i]); + delete vec_copy[i]; // we owned those pointers. } WriteMinibatch(egs_to_merge); } @@ -1244,6 +1247,7 @@ void ExampleMerger::Finish() { vec.clear(); } } + stats_.PrintStats(); } diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 66624f69004..021e91959e3 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -197,7 +197,7 @@ class UtteranceSplitter { ~UtteranceSplitter(); - int32 ExitStatus() { return (total_frames_in_chunks_ > 0); } + int32 ExitStatus() { return (total_frames_in_chunks_ > 0 ? 0 : 1); } private: @@ -283,7 +283,7 @@ class UtteranceSplitter { const ExampleGenerationConfig &config_; - // The vector 'split_for_length_' is indexed by the num-frames of a file, and + // The vector 'splits_for_length_' is indexed by the num-frames of a file, and // gives us a list of alternative splits that we can use if the utternace has // that many frames. For example, if split_for_length[100] = ( (25, 40, 40), // (40, 65) ), it means we could either split as chunks of size (25, 40, 40) @@ -328,10 +328,11 @@ class ExampleMergingConfig { std::string minibatch_size; std::string discard_partial_minibatches; // for back-compatibility, not used. - ExampleMergingConfig(): compress(false), - measure_output_frames("deprecated"), - minibatch_size("256"), - discard_partial_minibatches("deprecated") { } + ExampleMergingConfig(const char *default_minibatch_size = "256"): + compress(false), + measure_output_frames("deprecated"), + minibatch_size(default_minibatch_size), + discard_partial_minibatches("deprecated") { } void Register(OptionsItf *po) { po->Register("compress", &compress, "If true, compress the output examples " @@ -344,7 +345,7 @@ class ExampleMergingConfig { "String controlling the minibatch size. May be just an integer, " "meaning a fixed minibatch size (e.g. --minibatch-size=128). " "May be a list of ranges and values, e.g. --minibatch-size=32,64 " - "or --minibatch-size=16-32,64,128. All minibatches will be of " + "or --minibatch-size=16:32,64,128. All minibatches will be of " "the largest size until the end of the input is reached; " "then, increasingly smaller sizes will be allowed. Only egs " "with the same structure (e.g num-frames) are merged. You may " @@ -352,7 +353,7 @@ class ExampleMergingConfig { "(defined as the maximum number of Indexes on any input), in " "the format " "--minibatch-size='eg_size1=mb_sizes1/eg_size2=mb_sizes2', e.g. " - "--minibatch-size=128=64-128,256/256=32-64,128. Egs are given " + "--minibatch-size=128=64:128,256/256=32:64,128. Egs are given " "minibatch-sizes based on the specified eg-size closest to " "their actual size."); } @@ -385,9 +386,9 @@ class ExampleMergingConfig { private: - // struct IntSet is a representation of something like 16-32,64, which is a - // nonempty list of either nonnegative integers or ranges of nonnegative - // integers. Conceptually it represents a set of nonnegative integers. 
+ // struct IntSet is a representation of something like 16:32,64, which is a + // nonempty list of either positive integers or ranges of positive integers. + // Conceptually it represents a set of positive integers. struct IntSet { // largest_size is the largest integer in any of the ranges (64 in this // example). @@ -489,14 +490,14 @@ class ExampleMerger { void AcceptExample(NnetExample *a); // This function announces to the class that the input has finished, so it - // should flush out any smaller-sizes minibatches, as dictated by the config. + // should flush out any smaller-sized minibatches, as dictated by the config. // This will be called in the destructor, but you can call it explicitly when - // all the input is done if you want to. - // It also prints the stats. + // all the input is done if you want to; it won't repeat anything if called + // twice. It also prints the stats. void Finish(); // returns a suitable exit status for a program. - bool ExitStatus() { return num_egs_written_ > 0 ? 0 : 1; } + int32 ExitStatus() { Finish(); return (num_egs_written_ > 0 ? 0 : 1); } ~ExampleMerger() { Finish(); }; private: diff --git a/src/nnet3/nnet-nnet.h b/src/nnet3/nnet-nnet.h index 19cfb3949ad..0e6918de18d 100644 --- a/src/nnet3/nnet-nnet.h +++ b/src/nnet3/nnet-nnet.h @@ -247,8 +247,8 @@ class Nnet { void RemoveSomeNodes(const std::vector &nodes_to_remove); void ResetGenerators(); // resets random-number generators for all - // random components. You must also set srand() for this to be - // effective. + // random components. You must call srand() prior to this call, for this to + // be effective. // This function outputs to "config_lines" the lines of a config file. If you diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index f5687ebbe1e..9e534256e3f 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -28,7 +28,7 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, Nnet *nnet): config_(config), nnet_(nnet), - compiler_(*nnet, config_.optimize_config), + compiler_(*nnet, config_.optimize_config, config_.compiler_config), num_minibatches_processed_(0) { if (config.zero_component_stats) ZeroComponentStats(nnet); @@ -44,11 +44,11 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, if (config_.read_cache != "") { bool binary; - Input ki; - if (ki.Open(config_.read_cache, &binary)) { + try { + Input ki(config_.read_cache, &binary); compiler_.ReadCache(ki.Stream(), binary); KALDI_LOG << "Read computation cache from " << config_.read_cache; - } else { + } catch (...) { KALDI_WARN << "Could not open cached computation. " "Probably this is the first training iteration."; } @@ -164,14 +164,14 @@ void NnetTrainer::UpdateParamsWithMaxChange() { if (min_scale < 1.0) ostr << "Per-component max-change active on " << num_max_change_per_component_applied_per_minibatch - << " / " << num_updatable << " updatable Components; " - << "smallest factor=" << min_scale << " on " + << " / " << num_updatable << " Updatable Components." + << "(smallest factor=" << min_scale << " on " << component_name_with_min_scale - << " with max-change=" << max_change_with_min_scale << '.'; + << " with max-change=" << max_change_with_min_scale <<"). 
"; if (param_delta > config_.max_param_change) ostr << "Global max-change factor was " << config_.max_param_change / param_delta - << " with max-change=" << config_.max_param_change << '.'; + << " with max-change=" << config_.max_param_change << "."; KALDI_LOG << ostr.str(); } // applies both of the max-change scalings all at once, component by component diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index 70c90267c66..55d3e02ea67 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -42,6 +42,7 @@ struct NnetTrainerOptions { BaseFloat max_param_change; NnetOptimizeOptions optimize_config; NnetComputeOptions compute_config; + CachingOptimizingCompilerOptions compiler_config; NnetTrainerOptions(): zero_component_stats(true), store_component_stats(true), @@ -79,8 +80,8 @@ struct NnetTrainerOptions { // register the optimization options with the prefix "optimization". ParseOptions optimization_opts("optimization", opts); optimize_config.Register(&optimization_opts); - - + ParseOptions compiler_opts("compiler", opts); + compiler_config.Register(&compiler_opts); // register the compute options with the prefix "computation". ParseOptions compute_opts("computation", opts); compute_config.Register(&compute_opts); diff --git a/src/nnet3bin/nnet3-copy-egs.cc b/src/nnet3bin/nnet3-copy-egs.cc index efb51f51910..42413114af3 100644 --- a/src/nnet3bin/nnet3-copy-egs.cc +++ b/src/nnet3bin/nnet3-copy-egs.cc @@ -191,6 +191,7 @@ bool SelectFromExample(const NnetExample &eg, int32 right_context, int32 frame_shift, NnetExample *eg_out) { + static bool warned_left = false, warned_right = false; int32 min_input_t, max_input_t, min_output_t, max_output_t; if (!ContainsSingleExample(eg, &min_input_t, &max_input_t, @@ -214,21 +215,26 @@ bool SelectFromExample(const NnetExample &eg, min_output_t = max_output_t = frame; } } - // There may come a time when we want to remove or make it possible to disable - // the error messages below. The std::max and std::min expressions may seem - // unnecessary but are intended to make life easier if and when we do that. 
if (left_context != -1) { - if (min_input_t > min_output_t - left_context) - KALDI_ERR << "You requested --left-context=" << left_context - << ", but example only has left-context of " - << (min_output_t - min_input_t); + if (!warned_left && min_input_t > min_output_t - left_context) { + warned_left = true; + KALDI_WARN << "You requested --left-context=" << left_context + << ", but example only has left-context of " + << (min_output_t - min_input_t) + << " (will warn only once; this may be harmless if " + "using any --*left-context-initial options)"; + } min_input_t = std::max(min_input_t, min_output_t - left_context); } if (right_context != -1) { - if (max_input_t < max_output_t + right_context) - KALDI_ERR << "You requested --right-context=" << right_context + if (!warned_right && max_input_t < max_output_t + right_context) { + warned_right = true; + KALDI_WARN << "You requested --right-context=" << right_context << ", but example only has right-context of " - << (max_input_t - max_output_t); + << (max_input_t - max_output_t) + << " (will warn only once; this may be harmless if " + "using any --*right-context-final options."; + } max_input_t = std::min(max_input_t, max_output_t + right_context); } FilterExample(eg, @@ -357,5 +363,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/nnet3bin/nnet3-discriminative-merge-egs.cc b/src/nnet3bin/nnet3-discriminative-merge-egs.cc index 0edf960fdf9..bc4cdfb2941 100644 --- a/src/nnet3bin/nnet3-discriminative-merge-egs.cc +++ b/src/nnet3bin/nnet3-discriminative-merge-egs.cc @@ -41,10 +41,8 @@ int main(int argc, char *argv[]) { "nnet3-discriminative-merge-egs --minibatch-size=128 ark:1.degs ark:- | nnet3-discriminative-train ... \n" "See also nnet3-discriminative-copy-egs\n"; - ExampleMergingConfig merging_config; - merging_config.minibatch_size = 64; // change the default for this - // program.. anyway it will usually be - // set on the command line. + ExampleMergingConfig merging_config("64"); // 64 is default minibatch size. 
+ ParseOptions po(usage); merging_config.Register(&po); @@ -61,8 +59,9 @@ int main(int argc, char *argv[]) { SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier); NnetDiscriminativeExampleWriter example_writer(examples_wspecifier); + merging_config.ComputeDerived(); DiscriminativeExampleMerger merger(merging_config, &example_writer); - while (!example_reader.Done()) { + for (; !example_reader.Done(); example_reader.Next()) { const NnetDiscriminativeExample &cur_eg = example_reader.Value(); merger.AcceptExample(new NnetDiscriminativeExample(cur_eg)); } @@ -74,6 +73,3 @@ int main(int argc, char *argv[]) { return -1; } } - - - diff --git a/src/nnet3bin/nnet3-get-egs-dense-targets.cc b/src/nnet3bin/nnet3-get-egs-dense-targets.cc index ddcf5f23555..54d607466b5 100644 --- a/src/nnet3bin/nnet3-get-egs-dense-targets.cc +++ b/src/nnet3bin/nnet3-get-egs-dense-targets.cc @@ -208,9 +208,6 @@ int main(int argc, char *argv[]) { eg_config.ComputeDerived(); UtteranceSplitter utt_splitter(eg_config); - if (num_targets <= 0) - KALDI_ERR << "--num-targets options is required."; - std::string feature_rspecifier = po.GetArg(1), matrix_rspecifier = po.GetArg(2), examples_wspecifier = po.GetArg(3); diff --git a/src/nnet3bin/nnet3-merge-egs.cc b/src/nnet3bin/nnet3-merge-egs.cc index 33a65d140f2..081c0a21c7b 100644 --- a/src/nnet3bin/nnet3-merge-egs.cc +++ b/src/nnet3bin/nnet3-merge-egs.cc @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { ExampleMerger merger(merging_config, &example_writer); - while (!example_reader.Done()) { + for (; !example_reader.Done(); example_reader.Next()) { const NnetExample &cur_eg = example_reader.Value(); merger.AcceptExample(new NnetExample(cur_eg)); } From 8b410f1d27d85faed8532592bfc273864d517664 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 1 Jan 2017 17:47:55 -0500 Subject: [PATCH 042/213] Change how descriptors behave in nnet3 (Issue #1125) --- src/nnet3/nnet-common.h | 6 ++++++ src/nnet3/nnet-descriptor.cc | 23 ++++++++++++----------- src/nnet3/nnet-descriptor.h | 2 +- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/nnet3/nnet-common.h b/src/nnet3/nnet-common.h index cb5d8c3b944..9134e2545de 100644 --- a/src/nnet3/nnet-common.h +++ b/src/nnet3/nnet-common.h @@ -62,6 +62,12 @@ struct Index { Index operator + (const Index &other) const { return Index(n+other.n, t+other.t, x+other.x); } + Index &operator += (const Index &other) { + n += other.n; + t += other.t; + x += other.x; + return *this; + } void Write(std::ostream &os, bool binary) const; diff --git a/src/nnet3/nnet-descriptor.cc b/src/nnet3/nnet-descriptor.cc index 162a55b8149..d02bc49a5af 100644 --- a/src/nnet3/nnet-descriptor.cc +++ b/src/nnet3/nnet-descriptor.cc @@ -107,9 +107,9 @@ void OffsetForwardingDescriptor::GetNodeDependencies( } Cindex OffsetForwardingDescriptor::MapToInput(const Index &ind) const { - Cindex answer = src_->MapToInput(ind); - answer.second = answer.second + offset_; - return answer; + Index ind_mod(ind); + ind_mod += offset_; + return src_->MapToInput(ind_mod); } @@ -173,12 +173,13 @@ void RoundingForwardingDescriptor::GetNodeDependencies( Cindex RoundingForwardingDescriptor::MapToInput(const Index &ind) const { KALDI_ASSERT(t_modulus_ >= 1); - Cindex ans = src_->MapToInput(ind); - int32 mod = ans.second.t % t_modulus_; + Index ind_mod(ind); + // unfortunately doing "mathematical" modulus is a bit painful in C. 
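  // (Worked example, for illustration: with t_modulus_ = 3 and ind.t = -1,
  //  C's '%' gives -1 % 3 == -1; adding t_modulus_ makes mod == 2, so t
  //  becomes -1 - 2 = -3, i.e. the next-lowest multiple of 3, as intended.)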
+ int32 mod = ind_mod.t % t_modulus_; if (mod < 0) mod += t_modulus_; - ans.second.t -= mod; - return ans; + ind_mod.t -= mod; + return src_->MapToInput(ind_mod); } ForwardingDescriptor *RoundingForwardingDescriptor::Copy() const { @@ -199,15 +200,15 @@ void ReplaceIndexForwardingDescriptor::GetNodeDependencies( } Cindex ReplaceIndexForwardingDescriptor::MapToInput(const Index &ind) const { - Cindex ans = src_->MapToInput(ind); + Index ind_mod(ind); switch (variable_name_) { - case kT: ans.second.t = value_; break; - case kX: ans.second.x = value_; break; + case kT: ind_mod.t = value_; break; + case kX: ind_mod.x = value_; break; default: // kN or any other value is not allowed (doesn't make sense // to change the minibatch index in this way). KALDI_ERR << "Invalid variable name"; } - return ans; + return src_->MapToInput(ind_mod); } ForwardingDescriptor *ReplaceIndexForwardingDescriptor::Copy() const { diff --git a/src/nnet3/nnet-descriptor.h b/src/nnet3/nnet-descriptor.h index 93650e84307..e2d2c41772d 100644 --- a/src/nnet3/nnet-descriptor.h +++ b/src/nnet3/nnet-descriptor.h @@ -70,7 +70,7 @@ namespace nnet3 { ;; arguments ::= Switch(, [, ...]) ;; For use in clockwork RNNs or similar, Round() rounds the time-index t of the -;; requested Index to the next-lowest multiple of the integer +;; requested Index to the next-lowest multiple of the integer , ;; and evaluates the input argument for the resulting Index. ::= Round(, ) ;; is an integer ;; ReplaceIndex replaces some (t or x) in the requested Index From c353dc95f27751e771a0b1f5b0360a55ae18f1d7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 1 Jan 2017 20:18:30 -0500 Subject: [PATCH 043/213] Various code fixes and additional diagnostics --- src/chainbin/nnet3-chain-compute-prob.cc | 2 - src/nnet3/nnet-chain-diagnostics.cc | 2 +- src/nnet3/nnet-diagnostics.cc | 2 +- src/nnet3/nnet-diagnostics.h | 5 +- src/nnet3/nnet-optimize.cc | 104 ++++++++++++++++++----- src/nnet3/nnet-optimize.h | 42 +++++---- 6 files changed, 116 insertions(+), 41 deletions(-) diff --git a/src/chainbin/nnet3-chain-compute-prob.cc b/src/chainbin/nnet3-chain-compute-prob.cc index 7f9d688777a..830f1e8cee4 100644 --- a/src/chainbin/nnet3-chain-compute-prob.cc +++ b/src/chainbin/nnet3-chain-compute-prob.cc @@ -84,5 +84,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index b6b39816337..54d73a6ead3 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -32,7 +32,7 @@ NnetChainComputeProb::NnetChainComputeProb( chain_config_(chain_config), den_graph_(den_fst, nnet.OutputDim("output")), nnet_(nnet), - compiler_(nnet, nnet_config_.optimize_config), + compiler_(nnet, nnet_config_.optimize_config, nnet_config_.compiler_config), deriv_nnet_(NULL), num_minibatches_processed_(0) { if (nnet_config_.compute_deriv) { diff --git a/src/nnet3/nnet-diagnostics.cc b/src/nnet3/nnet-diagnostics.cc index e7adeffeb09..d7de17682da 100644 --- a/src/nnet3/nnet-diagnostics.cc +++ b/src/nnet3/nnet-diagnostics.cc @@ -28,7 +28,7 @@ NnetComputeProb::NnetComputeProb(const NnetComputeProbOptions &config, config_(config), nnet_(nnet), deriv_nnet_(NULL), - compiler_(nnet), + compiler_(nnet, config_.optimize_config, config_.compiler_config), num_minibatches_processed_(0) { if (config_.compute_deriv) { deriv_nnet_ = new Nnet(nnet_); diff --git a/src/nnet3/nnet-diagnostics.h b/src/nnet3/nnet-diagnostics.h index 298548857dd..fd2ceb1df9e 100644 --- 
a/src/nnet3/nnet-diagnostics.h +++ b/src/nnet3/nnet-diagnostics.h @@ -46,6 +46,7 @@ struct NnetComputeProbOptions { bool compute_accuracy; NnetOptimizeOptions optimize_config; NnetComputeOptions compute_config; + CachingOptimizingCompilerOptions compiler_config; NnetComputeProbOptions(): debug_computation(false), compute_deriv(false), @@ -60,7 +61,9 @@ struct NnetComputeProbOptions { // register the optimization options with the prefix "optimization". ParseOptions optimization_opts("optimization", opts); optimize_config.Register(&optimization_opts); - + // register the compiler options with the prefix "compiler". + ParseOptions compiler_opts("compiler", opts); + compiler_config.Register(&compiler_opts); // register the compute options with the prefix "computation". ParseOptions compute_opts("computation", opts); compute_config.Register(&compute_opts); diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 54ebf17edc7..f024d68aed7 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -18,6 +18,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. +#include #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-optimize-utils.h" #include "base/timer.h" @@ -546,6 +547,24 @@ size_t ComputationRequestHasher::operator() (const ComputationRequest *cr) const return ans; } + +CachingOptimizingCompiler::CachingOptimizingCompiler( + const Nnet &nnet, + const CachingOptimizingCompilerOptions config): + nnet_(nnet), config_(config), + seconds_taken_total_(0.0), seconds_taken_compile_(0.0), + seconds_taken_optimize_(0.0), seconds_taken_expand_(0.0), + seconds_taken_check_(0.0), seconds_taken_indexes_(0.0) { } + +CachingOptimizingCompiler::CachingOptimizingCompiler( + const Nnet &nnet, + const NnetOptimizeOptions &opt_config, + const CachingOptimizingCompilerOptions config): + nnet_(nnet), config_(config), opt_config_(opt_config), + seconds_taken_total_(0.0), seconds_taken_compile_(0.0), + seconds_taken_optimize_(0.0), seconds_taken_expand_(0.0), + seconds_taken_check_(0.0), seconds_taken_indexes_(0.0) { } + void CachingOptimizingCompiler::UpdateCache(const ComputationRequest *request, const NnetComputation *computation) { if (computation_cache_.size() == config_.cache_capacity) { @@ -615,12 +634,33 @@ CachingOptimizingCompiler::~CachingOptimizingCompiler() { delete itr->first; delete itr->second.first; } - KALDI_LOG << seconds_taken_ << " seconds taken in nnet3 compilation"; + std::ostringstream os; + double seconds_taken_misc = seconds_taken_total_ - seconds_taken_compile_ + - seconds_taken_optimize_ - seconds_taken_expand_ + - seconds_taken_check_ - seconds_taken_indexes_; + os << std::setprecision(3) << seconds_taken_total_ + << " seconds taken in nnet3 compilation total (breakdown: " + << seconds_taken_compile_ << " compilation, " + << seconds_taken_optimize_ << " optimization, " + << seconds_taken_expand_ << " shortcut expansion, " + << seconds_taken_check_ << " checking, " + << seconds_taken_indexes_ << " computing indexes, " + << seconds_taken_misc << " misc.)"; + KALDI_LOG << os.str(); + // note: the leftover amount is misc things like hashing and == comparisons on + // computation-requests, and calling RequestIsDecomposable(). 
} const NnetComputation* CachingOptimizingCompiler::Compile( const ComputationRequest &in_request) { Timer timer; + const NnetComputation *ans = CompileInternal(in_request); + seconds_taken_total_ += timer.Elapsed(); + return ans; +} + +const NnetComputation* CachingOptimizingCompiler::CompileInternal( + const ComputationRequest &in_request) { const NnetComputation *ans; // find computation in the cache CacheType::iterator cit = computation_cache_.find(&in_request); @@ -632,7 +672,6 @@ const NnetComputation* CachingOptimizingCompiler::Compile( UpdateAccessQueue(cit); ans = computation; } - seconds_taken_ += timer.Elapsed(); return ans; } @@ -658,7 +697,12 @@ const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut( // There may be situations where we'd prefer not to keep it, for speed. CompilerOptions opts; NnetComputation *computation = new NnetComputation; - compiler.CreateComputation(opts, computation); + + { + Timer timer; + compiler.CreateComputation(opts, computation); + seconds_taken_compile_ += timer.Elapsed(); + } int32 verbose_cutoff = 4; if (GetVerboseLevel() >= verbose_cutoff) { @@ -669,28 +713,43 @@ const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut( computation->Print(os2, nnet_); KALDI_LOG << "Generated computation is: " << os2.str(); } - { // some checking. Note: there may be a time when we might - // prefer not do to this checking. + { // some checking. Note: there may come a time when we might + // prefer to disable this checking. + Timer timer; CheckComputationOptions check_config; // we can do the rewrite check since it's before optimization. check_config.check_rewrite = true; ComputationChecker checker(check_config, nnet_, *computation); checker.Check(); + seconds_taken_check_ += timer.Elapsed(); } - Optimize(opt_config_, nnet_, - MaxOutputTimeInRequest(request), - computation); + + { + Timer timer; + Optimize(opt_config_, nnet_, + MaxOutputTimeInRequest(request), + computation); + seconds_taken_optimize_ += timer.Elapsed(); + } + + if (GetVerboseLevel() >= verbose_cutoff) { std::ostringstream os; computation->Print(os, nnet_); KALDI_LOG << "Optimized computation is: " << os.str(); } { // check the computation again. + Timer timer; CheckComputationOptions check_config; ComputationChecker checker(check_config, nnet_, *computation); checker.Check(); + seconds_taken_check_ += timer.Elapsed(); + } + { + Timer timer; + computation->ComputeCudaIndexes(); + seconds_taken_indexes_ += timer.Elapsed(); } - computation->ComputeCudaIndexes(); return computation; } @@ -705,11 +764,12 @@ const NnetComputation* CachingOptimizingCompiler::CompileViaShortcut( if (!RequestIsDecomposable(request, &mini_request, &num_n_values)) return NULL; - // by invoking Compile() on the mini request, we go through the same - // caching process as for any externally requested computation. - // note: this pointer is not being 'given to us'... it's owned in - // the cache. - const NnetComputation *mini_computation = Compile(mini_request); + // By invoking CompileInternal() on the mini request, we go through the same + // caching process as for any externally requested computation. [the only + // difference from Compile() is that it doesn't call the timer code; this + // avoids double-counting the time taken.] This pointer will not have to be + // deleted by this function; it's owned by the class, in the cache. + const NnetComputation *mini_computation = CompileInternal(mini_request); // note: by default we always create debug_info, even in regular compilation. // (e.g. 
it defaults to true in CompilerOptions). If it really seems to be a @@ -719,11 +779,17 @@ const NnetComputation* CachingOptimizingCompiler::CompileViaShortcut( NnetComputation *ans = new NnetComputation(); - ExpandComputation(nnet_, request.misc_info, *mini_computation, - need_debug_info, num_n_values, ans); - - ans->ComputeCudaIndexes(); - + { + Timer timer; + ExpandComputation(nnet_, request.misc_info, *mini_computation, + need_debug_info, num_n_values, ans); + seconds_taken_expand_ += timer.Elapsed(); + } + { + Timer timer; + ans->ComputeCudaIndexes(); + seconds_taken_indexes_ += timer.Elapsed(); + } return ans; } diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index ab0721e802a..bbe5269c982 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -206,16 +206,13 @@ class CachingOptimizingCompiler { public: CachingOptimizingCompiler(const Nnet &nnet, const CachingOptimizingCompilerOptions config = - CachingOptimizingCompilerOptions()): - nnet_(nnet), config_(config), seconds_taken_(0.0) { } + CachingOptimizingCompilerOptions()); /// Note: nnet is retained as a const reference but opt_config is copied. CachingOptimizingCompiler(const Nnet &nnet, const NnetOptimizeOptions &opt_config, const CachingOptimizingCompilerOptions config = - CachingOptimizingCompilerOptions()): - nnet_(nnet), config_(config), opt_config_(opt_config), - seconds_taken_(0.0) { } + CachingOptimizingCompilerOptions()); ~CachingOptimizingCompiler(); /// Does the compilation and returns a const pointer to @@ -226,11 +223,17 @@ class CachingOptimizingCompiler { void ReadCache(std::istream &is, bool binary); void WriteCache(std::ostream &os, bool binary) const; private: - // This function, called from Compile(), is called when a ComputationRequest - // has been determined not to have already been cached. It otherwise has the - // same interface as Compile(), but assumes that there is nothing cached for - // this computation as yet. It compiles the computation and takes care of - // caching it. + + // This function just implements the work of Compile(); it's made a separate + // function for the convenience of the timer code, to avoid it being called + // twice (we also call this function directly from inside the class). + const NnetComputation* CompileInternal(const ComputationRequest &request); + + // This function, called from CompileInternal(), is called when a + // ComputationRequest has been determined not to have already been cached. It + // otherwise has the same interface as CompileInternal(), but assumes that + // there is nothing cached for this computation as yet. It compiles the + // computation and takes care of caching it. const NnetComputation* CompileAndCache(const ComputationRequest &request); @@ -274,13 +277,18 @@ class CachingOptimizingCompiler { ComputationRequestPtrEqual> CacheType; CacheType computation_cache_; - // time spent in compilation-- for diagnostic messages - double seconds_taken_; - - // This function updates the computation cache. It is called within Compile(). - // It takes ownership of the pointers. It inserts the request at the end of - // the queue, and purges the least-recently-accessed request from the queue and - // the cache if the capacity is reached. 
+ // seconds spent in various phases of compilation-- for diagnostic messages + double seconds_taken_total_; + double seconds_taken_compile_; + double seconds_taken_optimize_; + double seconds_taken_expand_; + double seconds_taken_check_; + double seconds_taken_indexes_; + + // This function updates the computation cache. It is called within + // CompileInternal(). It takes ownership of the pointers. It inserts the + // request at the end of the queue, and purges the least-recently-accessed + // request from the queue and the cache if the capacity is reached. void UpdateCache(const ComputationRequest *request, const NnetComputation *computation); // This function updates the recently accessed queue. From d3eaea0f1ba67bf69777221fa10b58b9c5dd6067 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 4 Jan 2017 20:10:44 -0500 Subject: [PATCH 044/213] Various unrelated fixes to nnet3 code. --- src/nnet3/nnet-derivative-test.cc | 2 +- src/nnet3/nnet-optimize-utils.cc | 34 ++++++---- src/nnet3/nnet-optimize.cc | 73 +++++++++++----------- src/nnet3/nnet-optimize.h | 5 ++ src/nnet3bin/nnet3-latgen-faster-looped.cc | 14 +++-- src/nnet3bin/nnet3-latgen-faster.cc | 14 +++-- 6 files changed, 84 insertions(+), 58 deletions(-) diff --git a/src/nnet3/nnet-derivative-test.cc b/src/nnet3/nnet-derivative-test.cc index 4289b577a25..1f9e61e2b2a 100644 --- a/src/nnet3/nnet-derivative-test.cc +++ b/src/nnet3/nnet-derivative-test.cc @@ -416,8 +416,8 @@ void UnitTestNnetInputDerivatives() { int main() { using namespace kaldi; using namespace kaldi::nnet3; - // SetVerboseLevel(4); + // SetVerboseLevel(4); for (kaldi::int32 loop = 0; loop < 2; loop++) { #if HAVE_CUDA == 1 diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 41f3acb3916..adcd5fe22f0 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -1984,14 +1984,14 @@ static void FindNumLeadingAndTrailingNegatives(const std::vector &vec, // at least one nonnegative number. while (*ptr2 < 0) ptr2--; - KALDI_ASSERT(ptr2 != begin); // would be code error. + KALDI_ASSERT(ptr2 >= begin); // or would be code error. *num_trailing_negatives = end - 1 - ptr2; } // This function, called from SnipRowOps, is called when it encounters commands -// of type kCopyRows or kAddRows; it modifies such commands when the indexes -// have leading or trailing -1's,h, to make them operate on a smaller submatrix. -// It returns true if it made a change, and false otherwise. +// of type kAddRows; it modifies such commands when the indexes have leading or +// trailing -1's,h, to make them operate on a smaller submatrix. It returns +// true if it made a change, and false otherwise. static bool SnipSingleRowOp(NnetComputation *computation, int32 command_index) { NnetComputation::Command &c = computation->commands[command_index]; @@ -2010,12 +2010,16 @@ static bool SnipSingleRowOp(NnetComputation *computation, std::vector new_indexes(indexes.begin() + num_leading_negatives, indexes.begin() + num_leading_negatives + new_num_rows); + KALDI_ASSERT(new_indexes.back() >= 0); // TEMP c.arg3 = computation->indexes.size(); computation->indexes.push_back(std::vector()); computation->indexes.back().swap(new_indexes); c.arg1 = computation->NewSubMatrix(c.arg1, num_leading_negatives, new_num_rows, 0, -1); + if (new_num_rows == 15) { + KALDI_LOG << "HERE"; // TEMP + } return true; // made a change. 
} @@ -2059,9 +2063,9 @@ static void FindNumLeadingAndTrailingNegatives( // This function, called from SnipRowOps, is called when it encounters commands -// of type kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti or kCopyToRowsMulti; -// have leading or trailing (-1,-1) pairs, to make them operate on a smaller -// submatrix. It returns true if it made a change, and false otherwise. +// of type kAddRowsMulti, kAddToRowsMulti, or kCopyToRowsMulti; have leading or +// trailing (-1,-1) pairs, to make them operate on a smaller submatrix. It +// returns true if it made a change, and false otherwise. static bool SnipMultiRowOp(NnetComputation *computation, int32 command_index) { NnetComputation::Command &c = computation->commands[command_index]; @@ -2093,7 +2097,7 @@ static bool SnipMultiRowOp(NnetComputation *computation, /* - This function, used in SnipRangeRowOp(), finds the number of leading, and + This function, used in SnipRangeRowOp(), finds the number of leading and trailing values in a vector of pairs of integers, that are the same (i.e. pairs of the form (x, x) for any x. [This is how we represent an empty range, which is a kind of no-op, in commands of kCopyRowRanges or @@ -2172,14 +2176,19 @@ bool SnipRowOps(NnetComputation *computation) { // non-const because we'll be changing it. NnetComputation::Command &c = computation->commands[command_index]; + // note: we can't do the snipping for commands of type case kCopyRows and case + // kCopyRowsMulti, because the -1's aren't a pure no-op; they have the + // meaning of setting the destination value to zero, so we can't prune + // them away. + switch (c.command_type) { - case kCopyRows: case kAddRows: { + case kAddRows: { if (SnipSingleRowOp(computation, command_index)) ans = true; break; } case kAddRowsMulti: case kAddToRowsMulti: - case kCopyRowsMulti: case kCopyToRowsMulti: { + case kCopyToRowsMulti: { if (SnipMultiRowOp(computation, command_index)) ans = true; break; @@ -2405,8 +2414,9 @@ void ComputationExpander::ExpandRowsCommand( num_n_values = num_n_values_, new_s1_size = expanded_computation_->submatrices[s1].num_rows, new_s2_size = expanded_computation_->submatrices[s2].num_rows; - KALDI_ASSERT(old_size % 2 == 0 && - old_size == computation_.submatrices[s1].num_rows); + + KALDI_ASSERT(old_size == computation_.submatrices[s1].num_rows); + new_indexes.resize(new_s1_size, -1); for (int32 i1 = 0; i1 < old_size; i1++) { diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index f024d68aed7..fcb0568dd5c 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -437,7 +437,9 @@ void Optimize(const NnetOptimizeOptions &config, if (GetVerboseLevel() >= 4) CheckComputation(nnet, *computation, true); - { // Call LimitDerivativeTimes(). + { // Call LimitDerivativeTimes(); it's important that this + // should come before other optimizations (search for "insist" in + // nnet-optimize-utils.cc for the reasons). // this will do nothing unless --min-deriv-time or --max-deriv-time // or --max-deriv-time-relative was set. 
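    // (Worked example, going by the option's help text: --max-deriv-time-relative=20
    //  together with a request whose 'output' node has a largest 't' of 100
    //  behaves like --max-deriv-time=120.)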
int32 max_deriv_time = config.max_deriv_time; @@ -448,18 +450,18 @@ void Optimize(const NnetOptimizeOptions &config, max_deriv_time, computation); } - if (GetVerboseLevel() >= 4) + if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, true); if (config.optimize && config.consolidate_model_update) ConsolidateModelUpdate(nnet, computation); - if (GetVerboseLevel() >= 4) + if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, true); if (config.optimize && config.convert_addition) { ConvertAdditionToAssignment(nnet, computation); - if (GetVerboseLevel() >= 4) + if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, true); } @@ -467,20 +469,19 @@ void Optimize(const NnetOptimizeOptions &config, (config.remove_assignments || config.backprop_in_place || config.propagate_in_place)) { VariableMergingOptimization(config, nnet, computation); - if (GetVerboseLevel() >= 4) + if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); } - if (config.optimize && config.optimize_row_ops) { - if (ReplaceRowWithMatrixOps(computation)) { - // if anything was changed... - - // We have to call RenumberComputation() to get rid of any removed - // indexes... actually this could be a little wasteful, but unfortunately - // it doesn't seem like we'd otherwise be doing any renumbering past this - // point. + if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { + bool must_renumber = false; + if (config.snip_row_ops && SnipRowOps(computation)) + must_renumber = true; + if (config.optimize_row_ops && ReplaceRowWithMatrixOps(computation)) + must_renumber = true; + if (must_renumber) { RenumberComputation(computation); - if (GetVerboseLevel() >= 4) + if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); } } @@ -488,13 +489,13 @@ void Optimize(const NnetOptimizeOptions &config, if (config.optimize && config.initialize_undefined) { RemoveUnnecessaryZeroing(nnet, computation); - if (GetVerboseLevel() >= 4) - CheckComputation(nnet, *computation, false); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); } if (config.optimize && config.move_sizing_commands) { MoveSizingCommands(nnet, computation); - if (GetVerboseLevel() >= 4) + if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); } @@ -503,7 +504,7 @@ void Optimize(const NnetOptimizeOptions &config, // because it's necessary for looped computation to run. if (config.optimize_looped_computation){ OptimizeLoopedComputation(nnet, computation); - if (GetVerboseLevel() >= 4) + if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); } @@ -513,7 +514,7 @@ void Optimize(const NnetOptimizeOptions &config, // would be correct in that case, as written. In any case the performance // benefit is tiny. 
RemoveUnnecessaryAllocation(nnet, computation); - if (GetVerboseLevel() >= 4) + if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); } @@ -526,7 +527,7 @@ void Optimize(const NnetOptimizeOptions &config, if (config.optimize_looped_computation) FixGotoLabel(computation); - if (GetVerboseLevel() >= 4) + if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); } @@ -634,21 +635,23 @@ CachingOptimizingCompiler::~CachingOptimizingCompiler() { delete itr->first; delete itr->second.first; } - std::ostringstream os; - double seconds_taken_misc = seconds_taken_total_ - seconds_taken_compile_ - - seconds_taken_optimize_ - seconds_taken_expand_ - - seconds_taken_check_ - seconds_taken_indexes_; - os << std::setprecision(3) << seconds_taken_total_ - << " seconds taken in nnet3 compilation total (breakdown: " - << seconds_taken_compile_ << " compilation, " - << seconds_taken_optimize_ << " optimization, " - << seconds_taken_expand_ << " shortcut expansion, " - << seconds_taken_check_ << " checking, " - << seconds_taken_indexes_ << " computing indexes, " - << seconds_taken_misc << " misc.)"; - KALDI_LOG << os.str(); - // note: the leftover amount is misc things like hashing and == comparisons on - // computation-requests, and calling RequestIsDecomposable(). + if (seconds_taken_total_ > 0.0) { + std::ostringstream os; + double seconds_taken_misc = seconds_taken_total_ - seconds_taken_compile_ + - seconds_taken_optimize_ - seconds_taken_expand_ + - seconds_taken_check_ - seconds_taken_indexes_; + os << std::setprecision(3) << seconds_taken_total_ + << " seconds taken in nnet3 compilation total (breakdown: " + << seconds_taken_compile_ << " compilation, " + << seconds_taken_optimize_ << " optimization, " + << seconds_taken_expand_ << " shortcut expansion, " + << seconds_taken_check_ << " checking, " + << seconds_taken_indexes_ << " computing indexes, " + << seconds_taken_misc << " misc.)"; + KALDI_LOG << os.str(); + // note: the leftover amount is misc things like hashing and == comparisons on + // computation-requests, and calling RequestIsDecomposable(). + } } const NnetComputation* CachingOptimizingCompiler::Compile( diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index bbe5269c982..538dde2bbc1 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -48,6 +48,7 @@ struct NnetOptimizeOptions { int32 min_deriv_time; int32 max_deriv_time; int32 max_deriv_time_relative; + bool snip_row_ops; // optimize_looped_computation is a 'hidden config' not available from // the command line; it's set to true to enable the optimization for // looped computation that turns a linear computation into a loop. @@ -69,6 +70,7 @@ struct NnetOptimizeOptions { min_deriv_time(std::numeric_limits::min()), max_deriv_time(std::numeric_limits::max()), max_deriv_time_relative(std::numeric_limits::max()), + snip_row_ops(true), optimize_looped_computation(false) { } void Register(OptionsItf *opts) { @@ -118,6 +120,9 @@ struct NnetOptimizeOptions { "variable. 
If set, it is equivalent to setting the " "--max-deriv-time to this value plus the largest 't' value " "in any 'output' node of the computation request."); + opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " + "disable an optimization that reduces the size of certain " + "per-row operations"); } void Read(std::istream &is, bool binary); void Write(std::ostream &os, bool binary) const; diff --git a/src/nnet3bin/nnet3-latgen-faster-looped.cc b/src/nnet3bin/nnet3-latgen-faster-looped.cc index 9ad20fd8764..6e6f5af4410 100644 --- a/src/nnet3bin/nnet3-latgen-faster-looped.cc +++ b/src/nnet3bin/nnet3-latgen-faster-looped.cc @@ -183,7 +183,7 @@ int main(int argc, char *argv[]) { &lattice_writer, &like)) { tot_like += like; - frame_count += features.NumRows(); + frame_count += nnet_decodable.NumFramesReady(); num_success++; } else num_fail++; } @@ -241,20 +241,24 @@ int main(int argc, char *argv[]) { &alignment_writer, &words_writer, &compact_lattice_writer, &lattice_writer, &like)) { tot_like += like; - frame_count += features.NumRows(); + frame_count += nnet_decodable.NumFramesReady(); num_success++; } else num_fail++; } } + kaldi::int64 input_frame_count = + frame_count * decodable_opts.frame_subsampling_factor; + double elapsed = timer.Elapsed(); KALDI_LOG << "Time taken "<< elapsed << "s: real-time factor assuming 100 frames/sec is " - << (elapsed*100.0/frame_count); + << (elapsed * 100.0 / input_frame_count); KALDI_LOG << "Done " << num_success << " utterances, failed for " << num_fail; - KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " - << frame_count<<" frames."; + KALDI_LOG << "Overall log-likelihood per frame is " + << (tot_like / frame_count) << " over " + << frame_count <<" frames."; delete word_syms; if (num_success != 0) return 0; diff --git a/src/nnet3bin/nnet3-latgen-faster.cc b/src/nnet3bin/nnet3-latgen-faster.cc index 5a090acb5b5..6bd5cd7c453 100644 --- a/src/nnet3bin/nnet3-latgen-faster.cc +++ b/src/nnet3bin/nnet3-latgen-faster.cc @@ -177,7 +177,7 @@ int main(int argc, char *argv[]) { &lattice_writer, &like)) { tot_like += like; - frame_count += features.NumRows(); + frame_count += nnet_decodable.NumFramesReady(); num_success++; } else num_fail++; } @@ -236,20 +236,24 @@ int main(int argc, char *argv[]) { &alignment_writer, &words_writer, &compact_lattice_writer, &lattice_writer, &like)) { tot_like += like; - frame_count += features.NumRows(); + frame_count += nnet_decodable.NumFramesReady(); num_success++; } else num_fail++; } } + kaldi::int64 input_frame_count = + frame_count * decodable_opts.frame_subsampling_factor; + double elapsed = timer.Elapsed(); KALDI_LOG << "Time taken "<< elapsed << "s: real-time factor assuming 100 frames/sec is " - << (elapsed*100.0/frame_count); + << (elapsed * 100.0 / input_frame_count); KALDI_LOG << "Done " << num_success << " utterances, failed for " << num_fail; - KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " - << frame_count<<" frames."; + KALDI_LOG << "Overall log-likelihood per frame is " + << (tot_like / frame_count) << " over " + << frame_count << " frames."; delete word_syms; if (num_success != 0) return 0; From 7da1bb1c0d333444f176826a8596021ac36b1739 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 5 Jan 2017 00:40:46 -0500 Subject: [PATCH 045/213] Add code and scripts for LSTM with bounded activation --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 178 +++++------------- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 - 
src/nnet3/nnet-general-component.cc | 43 +++-- src/nnet3/nnet-general-component.h | 19 +- src/nnet3/nnet-test-utils.cc | 3 + 5 files changed, 93 insertions(+), 151 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 7c5f262a7f5..45a425dbc60 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -56,6 +56,9 @@ def check_configs(self): if self.config['cell-dim'] <= 0: raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + for key in ['self-repair-scale-nonlinearity']: if self.config[key] < 0.0 or self.config[key] > 1.0: raise RuntimeError("{0} has invalid value {1}.".format(key, self.config[key])) @@ -269,6 +272,9 @@ def check_configs(self): raise RuntimeError("{0} has invalid value {1}.".format( key, self.config[key])) + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + if (self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] > self.config['cell-dim']): @@ -444,127 +450,6 @@ def generate_lstm_config(self): return configs -# Same as the LSTMP layer except that the matrix multiplications are combined -# we probably keep only version after experimentation. One year old experiments -# show that this version is slightly worse and might require some tuning -class XconfigLstmpcLayer(XconfigLstmpLayer): - def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "lstmpc-layer" - XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) - - # convenience function to generate the LSTM config - def generate_lstm_config(self): - # assign some variables to reduce verbosity - name = self.name - # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] - - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - # Natural gradient per element scale parameters - # TODO: decide if we want to keep exposing these options - if re.search('param-mean', ng_per_element_scale_options) is None and \ - re.search('param-stddev', ng_per_element_scale_options) is None: - ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " - pes_str = ng_per_element_scale_options - - configs = [] - # naming convention - # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] - configs.append("### Begin LTSM layer '{0}'".format(name)) - configs.append("# Full W_ifoc* matrix") - configs.append("component name={0}.W_ifoc.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, 4*cell_dim, affine_str)) - configs.append("# note : the cell outputs pass through a diagonal matrix") - - # we will not combine the diagonal matrix operations as one of these has a different delay - configs.append("# note : the cell outputs pass through a diagonal matrix") - configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) - configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) - configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) - - configs.append("# Defining the non-linearities") - configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - - configs.append("# Defining the components for other cell computations") - configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) - - # c1_t and c2_t defined below - configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) - delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) - rec_connection = '{0}.rp_t'.format(name) - - component_nodes.append("component-node name={0}.ifoc_t component={0}.W_ifoc.xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - - - offset = 0 - component_nodes.append("# i_t") - component_nodes.append("dim-range-node name={0}.i1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) - offset += cell_dim - component_nodes.append("component-node name={0}.i2_t component={0}.w_i.cinput={1}".format(name, delayed_c_t_descriptor)) - component_nodes.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) - - component_nodes.append("# f_t") - component_nodes.append("dim-range-node name={0}.f1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) - offset += cell_dim - component_nodes.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) - component_nodes.append("component-node 
name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) - - component_nodes.append("# o_t") - component_nodes.append("dim-range-node name={0}.o1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) - offset += cell_dim - component_nodes.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) - component_nodes.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) - - component_nodes.append("# h_t") - component_nodes.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) - - component_nodes.append("# g_t") - component_nodes.append("dim-range-node name={0}.g1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) - offset += cell_dim - component_nodes.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) - - - configs.append("# parts of c_t") - configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) - configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) - - configs.append("# m_t") - configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) - - # add the recurrent connections - configs.append("# projection matrices : Wrm and Wpm") - configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) - configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) - - configs.append("# r_t and p_t : rp_t will be the output") - configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) - configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) - configs.append("### End LTSM layer '{0}'".format(name)) - - return configs - # This class is for lines like # 'fast-lstm-layer name=lstm1 input=[-1] delay=-3' @@ -588,6 +473,12 @@ def generate_lstm_config(self): # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] # ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] +# max-cell-value=-1 [If >0, an approximate maximum on the contents of the cell (c_t); +# enforced by putting a scaling factor of +# recurrence_scale = 1 - abs(delay)/max_cell_value +# on the recurrence, i.e. the term c_{t-1} in the LSTM equations. +# E.g. setting this to 50 means the activations can't get bigger +# than about 50.] class XconfigFastLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "fast-lstm-layer" @@ -607,7 +498,8 @@ def set_default_configs(self): 'lstm-nonlinearity-options' : ' max-change=0.75', # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. - 'ng-affine-options' : ' max-change=1.5' + 'ng-affine-options' : ' max-change=1.5', + 'max-cell-value': -1.0 } self.c_needed = False # keep track of whether the 'c' output is needed. 
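The max-cell-value option documented above works by shrinking the c_{t-1} recurrence term. A minimal sketch of that computation (the helper name below is hypothetical; in the patch the logic is inlined in generate_lstm_config):

    def compute_recurrence_scale(delay, max_cell_value):
        # max_cell_value < 0 (the default, -1.0) means no bound is applied.
        if max_cell_value < 0:
            return 1.0
        scale = 1.0 - abs(delay) / float(max_cell_value)
        # e.g. delay=-3, max-cell-value=50 gives scale = 0.94.  The scale must
        # stay positive; if it is not, max-cell-value was set too small
        # relative to the delay.
        assert scale > 0
        return scale

The resulting value is passed via bptrunc_str as the 'scale' of the BackpropTruncationComponent used on the recurrence, so the cell value decays slightly at each recurrence step instead of growing without bound.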
@@ -619,6 +511,8 @@ def check_configs(self): key = 'cell-dim' if self.config['cell-dim'] <= 0: raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") @@ -666,17 +560,23 @@ def generate_lstm_config(self): input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] delay = self.config['delay'] + affine_str = self.config['ng-affine-options'] + max_cell_value = self.config['max-cell-value'] + # we expect max_cell_value to be either -1, or large, like 10 or 50. + recurrence_scale = (1.0 if max_cell_value < 0 else + 1.0 - (abs(delay) / max_cell_value)) + assert recurrence_scale > 0 # or user may have set max-cell-value much + # too small. + lstm_str = self.config['lstm-nonlinearity-options'] bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" " recurrence-interval={3}" + " scale={4}" "".format(self.config['clipping-threshold'], self.config['zeroing-threshold'], self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - lstm_str = self.config['lstm-nonlinearity-options'] - + abs(delay), recurrence_scale)) configs = [] @@ -739,6 +639,12 @@ def generate_lstm_config(self): # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] # ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] +# max-cell-value=-1 [If >0, an approximate maximum on the contents of the cell (c_t); +# enforced by putting a scaling factor of +# recurrence_scale = 1 - abs(delay)/max_cell_value +# on the recurrence, i.e. the term c_{t-1} in the LSTM equations. +# E.g. setting this to 50 means the activations can't get bigger +# than about 50.] class XconfigFastLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "fast-lstmp-layer" @@ -759,6 +665,7 @@ def set_default_configs(self): # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', + 'max-cell-value': -1.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0 @@ -781,7 +688,8 @@ def check_configs(self): if self.config[key] <= 0: raise RuntimeError("{0} has invalid value {1}.".format( key, self.config[key])) - + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") if (self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] > self.config['cell-dim']): @@ -833,20 +741,28 @@ def generate_lstm_config(self): input_dim = self.descriptors['input']['dim'] input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] + delay = self.config['delay'] rec_proj_dim = self.config['recurrent-projection-dim'] nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] + affine_str = self.config['ng-affine-options'] + max_cell_value = self.config['max-cell-value'] + # we expect max_cell_value to be either -1, or large, like 10 or 50. + recurrence_scale = (1.0 if max_cell_value < 0 else + 1.0 - (abs(delay) / max_cell_value)) + assert recurrence_scale > 0 # or user may have set max-cell-value much + # too small. 
+ bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" " recurrence-interval={3}" + " scale={4}" "".format(self.config['clipping-threshold'], self.config['zeroing-threshold'], self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - lstm_str = self.config['lstm-nonlinearity-options'] + abs(delay), recurrence_scale)) + lstm_str = self.config['lstm-nonlinearity-options'] configs = [] diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index cc786d091ac..89458c65152 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -30,7 +30,6 @@ 'affine-layer' : xlayers.XconfigAffineLayer, 'lstm-layer' : xlayers.XconfigLstmLayer, 'lstmp-layer' : xlayers.XconfigLstmpLayer, - 'lstmpc-layer' : xlayers.XconfigLstmpcLayer, 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer, 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer } diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 160ff1d089e..9772c31b13b 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -19,7 +19,6 @@ #include #include -#include #include "nnet3/nnet-general-component.h" #include "nnet3/nnet-computation-graph.h" #include "nnet3/nnet-parse.h" @@ -557,7 +556,7 @@ void StatisticsPoolingComponent::InitFromConfig(ConfigLine *cfl) { if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); // do some basic checks here but Check() will check more completely. if (!ok || input_dim_ <= 0 || left_context_ + right_context_ <= 0 || num_log_count_features_ < 0) @@ -888,7 +887,15 @@ void BackpropTruncationComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); ReadBasicType(is, binary, &dim_); - ExpectToken(is, binary, ""); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &scale_); + ReadToken(is, binary, &tok); + } else { + scale_ = 1.0; + } + KALDI_ASSERT(tok == ""); ReadBasicType(is, binary, &clipping_threshold_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &zeroing_threshold_); @@ -912,6 +919,8 @@ void BackpropTruncationComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, scale_); WriteToken(os, binary, ""); WriteBasicType(os, binary, clipping_threshold_); WriteToken(os, binary, ""); @@ -958,7 +967,7 @@ void BackpropTruncationComponentPrecomputedIndexes::Read(std::istream &istream, std::string BackpropTruncationComponent::Info() const { std::ostringstream stream; stream << Type() << ", dim=" << dim_ - << ", count=" << std::setprecision(3) << count_ << std::setprecision(6) + << ", scale=" << scale_ << ", clipping-threshold=" << clipping_threshold_ << ", clipped-proportion=" << (count_ > 0.0 ? 
num_clipped_ / count_ : 0) @@ -971,14 +980,15 @@ std::string BackpropTruncationComponent::Info() const { return stream.str(); } -void BackpropTruncationComponent::Init(int32 dim, - BaseFloat clipping_threshold, - BaseFloat zeroing_threshold, - int32 zeroing_interval, - int32 recurrence_interval) { +void BackpropTruncationComponent::Init( + int32 dim, BaseFloat scale, BaseFloat clipping_threshold, + BaseFloat zeroing_threshold, int32 zeroing_interval, + int32 recurrence_interval) { KALDI_ASSERT(clipping_threshold >= 0 && zeroing_threshold >= 0 && - zeroing_interval > 0 && recurrence_interval > 0 && dim > 0); + scale > 0.0 && zeroing_interval > 0 && + recurrence_interval > 0 && dim > 0); dim_ = dim; + scale_ = scale; clipping_threshold_ = clipping_threshold; zeroing_threshold_ = zeroing_threshold; zeroing_interval_ = zeroing_interval; @@ -993,9 +1003,11 @@ void BackpropTruncationComponent::Init(int32 dim, void BackpropTruncationComponent::InitFromConfig(ConfigLine *cfl) { int32 dim = 0; bool ok = cfl->GetValue("dim", &dim); - BaseFloat clipping_threshold = 30.0; - BaseFloat zeroing_threshold = 15.0; + BaseFloat scale = 1.0, + clipping_threshold = 30.0, + zeroing_threshold = 15.0; int32 zeroing_interval = 20, recurrence_interval = 1; + cfl->GetValue("scale", &scale); cfl->GetValue("clipping-threshold", &clipping_threshold); cfl->GetValue("zeroing-threshold", &zeroing_threshold); cfl->GetValue("zeroing-interval", &zeroing_interval); @@ -1005,7 +1017,7 @@ void BackpropTruncationComponent::InitFromConfig(ConfigLine *cfl) { recurrence_interval < 1 || dim <= 0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; - Init(dim, clipping_threshold, zeroing_threshold, + Init(dim, scale, clipping_threshold, zeroing_threshold, zeroing_interval, recurrence_interval); } @@ -1013,6 +1025,7 @@ void BackpropTruncationComponent::InitFromConfig(ConfigLine *cfl) { Component* BackpropTruncationComponent::Copy() const { BackpropTruncationComponent *ans = new BackpropTruncationComponent(); ans->dim_ = dim_; + ans->scale_ = scale_; ans->clipping_threshold_ = clipping_threshold_; ans->zeroing_threshold_ = zeroing_threshold_; ans->zeroing_interval_ = zeroing_interval_; @@ -1066,6 +1079,8 @@ void BackpropTruncationComponent::Propagate( const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyFromMat(in); + if (scale_ != 1.0) + out->Scale(scale_); } // virtual @@ -1084,6 +1099,8 @@ void BackpropTruncationComponent::Backprop(const std::string &debug_info, // the following statement will do nothing if in_deriv and out_deriv have same // memory. in_deriv->CopyFromMat(out_deriv); + if (scale_ != 1.0) + in_deriv->Scale(scale_); BackpropTruncationComponent *to_update = dynamic_cast(to_update_in); diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 9750852544e..f389d019522 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -156,9 +156,9 @@ class DistributeComponentPrecomputedIndexes: StatisticsPoolingComponent to extract moving-average mean and standard-deviation statistics. - StatisticsExtractionComponent is designed to extract statistics-- 0th-order, + StatisticsExtractionExomponent designed to extract statistics-- 0th-order, 1st-order and optionally diagonal 2nd-order stats-- from small groups of - frames, such as 10 frames. The statistics will then be further processed by + frames, such as 10 frame. 
The statistics will then be further processed by StatisticsPoolingComponent to compute moving-average means and (if configured) standard deviations. The reason for the two-component way of doing this is efficiency, particularly in the graph-compilation phase. (Otherwise there @@ -185,7 +185,7 @@ class DistributeComponentPrecomputedIndexes: An output of this component will be 'computable' any time at least one of the corresponding inputs is computable. - In all cases the first dimension of the output will be a count (between 1 and + In all cases the first dimension of the output will be a count (between 1 and 10 inclusive in this example). If include-variance=false, then the output dimension will be input-dim + 1. and the output dimensions >0 will be 1st-order statistics (sums of the input). If include-variance=true, then the @@ -448,21 +448,22 @@ class StatisticsPoolingComponentPrecomputedIndexes: class BackpropTruncationComponent: public Component { public: BackpropTruncationComponent(int32 dim, + BaseFloat scale, BaseFloat clipping_threshold, BaseFloat zeroing_threshold, int32 zeroing_interval, int32 recurrence_interval) { - Init(dim, clipping_threshold, zeroing_threshold, + Init(dim, scale, clipping_threshold, zeroing_threshold, zeroing_interval, recurrence_interval);} - BackpropTruncationComponent(): dim_(0), clipping_threshold_(-1), + BackpropTruncationComponent(): dim_(0), scale_(1.0), clipping_threshold_(-1), zeroing_threshold_(-1), zeroing_interval_(0), recurrence_interval_(0), num_clipped_(0), num_zeroed_(0), count_(0), count_zeroing_boundaries_(0) { } virtual int32 InputDim() const { return dim_; } virtual int32 OutputDim() const { return dim_; } virtual void InitFromConfig(ConfigLine *cfl); - void Init(int32 dim, BaseFloat clipping_threshold, + void Init(int32 dim, BaseFloat scale, BaseFloat clipping_threshold, BaseFloat zeroing_threshold, int32 zeroing_interval, int32 recurrence_interval); @@ -506,6 +507,12 @@ class BackpropTruncationComponent: public Component { // input/output dimension int32 dim_; + // Scale that is applied in the forward propagation (and of course in the + // backprop to match. Expected to normally be 1, but setting this to other + // values (e.g. slightly less than 1) can be used to produce variants of + // LSTMs where the activations are bounded. 
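  // As a rough illustration (a sketch only, not text from the Kaldi sources):
  // the .cc file applies this as out = scale_ * in in Propagate() and
  // in_deriv = scale_ * out_deriv in Backprop().  For a scalar recurrence
  // c_t = s * c_{t-1} + d_t with |d_t| <= 1 and 0 < s < 1, the value satisfies
  // |c_t| < 1 / (1 - s); e.g. s = 0.98 keeps it below roughly 50, which is the
  // kind of bound the max-cell-value option in the xconfig scripts above aims for.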
+ BaseFloat scale_; + // threshold (e.g., 30) to be used for clipping corresponds to max-row-norm BaseFloat clipping_threshold_; diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 205fc031323..7ab46d1003e 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -543,6 +543,7 @@ void GenerateConfigSequenceLstmWithTruncation( int32 clipping_threshold = RandInt(6, 50), zeroing_threshold = RandInt(1, 5), zeroing_interval = RandInt(1, 5) * 10; + BaseFloat scale = 0.8 + 0.1*RandInt(0,3); os << "input-node name=input dim=" << input_dim << std::endl; @@ -616,12 +617,14 @@ void GenerateConfigSequenceLstmWithTruncation( << " output-dim=" << cell_dim << std::endl; os << "component name=c type=BackpropTruncationComponent dim=" << cell_dim + << " scale=" << scale << " clipping-threshold=" << clipping_threshold << " zeroing-threshold=" << zeroing_threshold << " zeroing-interval=" << zeroing_interval << " recurrence-interval=1" << std::endl; os << "component name=r type=BackpropTruncationComponent dim=" << projection_dim + << " scale=" << scale << " clipping-threshold=" << clipping_threshold << " zeroing-threshold=" << zeroing_threshold << " zeroing-interval=" << zeroing_interval From 29c7efd2e7b66a1467330141060c420ae317e5c4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 5 Jan 2017 00:41:23 -0500 Subject: [PATCH 046/213] Add steps/nnet3/decode_looped.sh --- egs/wsj/s5/steps/nnet3/decode.sh | 2 - egs/wsj/s5/steps/nnet3/decode_looped.sh | 193 ++++++++++++++++++++++++ 2 files changed, 193 insertions(+), 2 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet3/decode_looped.sh diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 10ac29e1c59..b97e7f415d7 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -23,7 +23,6 @@ ivector_scale=1.0 lattice_beam=8.0 # Beam we use in lattice generation. iter=final num_threads=1 # if >1, will use gmm-latgen-faster-parallel -parallel_opts= # ignored now. scoring_opts= skip_diagnostics=false skip_scoring=false @@ -56,7 +55,6 @@ if [ $# -ne 3 ]; then echo " --iter # Iteration of model to decode; default is final." echo " --scoring-opts # options to local/score.sh" echo " --num-threads # number of threads to use, default 1." - echo " --parallel-opts # e.g. '--num-threads 4' if you supply --num-threads 4" exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/decode_looped.sh b/egs/wsj/s5/steps/nnet3/decode_looped.sh new file mode 100755 index 00000000000..8850045c9a3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_looped.sh @@ -0,0 +1,193 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + + +# This is like decode.sh except it uses "looped" decoding. This is an nnet3 +# mechanism for reusing previously computed activations when we evaluate the +# neural net for successive chunks of data. It is applicable to TDNNs and LSTMs +# and similar forward-recurrent topologies, but not to backward-recurrent +# topologies like BLSTMs. Be careful because the script itself does not have a +# way to figure out what kind of topology you are using. +# +# Also be aware that this decoding mechanism means that you have effectively +# unlimited context within the utterance. Unless your models were trained (at +# least partly) on quite large chunk-sizes, e.g. 
100 or more (although the +# longer the BLSTM recurrence the larger chunk-size you'd need in training), +# there is a possibility that this effectively infinite left-context will cause +# a mismatch with the training condition. Also, for recurrent topologies, you may want to make sure +# that the --extra-left-context-initial matches the --egs.chunk-left-context-initial +# that you trained with, . [note: if not specified during training, it defaults to +# the same as the regular --extra-left-context + +# This script does decoding with a neural-net. If the neural net was built on +# top of fMLLR transforms from a conventional system, you should provide the +# --transform-dir option. + +# Begin configuration section. +stage=1 +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context_initial=0 +feat_type= +online_ivector_dir= +minimize=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + + +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features. 
+if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi + echo "$0: feature type is $feat_type" +fi + +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +case $feat_type in + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && \ + ! cmp $transform_dir/../final.mat $srcdir/final.mat && \ + ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then + echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +fi + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster-looped $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context-initial=$extra_left_context_initial \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! 
-z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $iter_opt $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; From 798fb667b4cb385edcc69e9b026b84d2276e1cc4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 6 Jan 2017 15:19:54 -0500 Subject: [PATCH 047/213] Updating egs-generation scripts to use new-style options for iVectors --- egs/wsj/s5/steps/nnet3/align.sh | 2 -- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 12 +++++------- egs/wsj/s5/steps/nnet3/get_egs.sh | 12 +++++------- egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh | 16 +++++++--------- egs/wsj/s5/steps/nnet3/get_egs_targets.sh | 13 +++++-------- 5 files changed, 22 insertions(+), 33 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh index 9befe16164f..713ecc128da 100755 --- a/egs/wsj/s5/steps/nnet3/align.sh +++ b/egs/wsj/s5/steps/nnet3/align.sh @@ -124,7 +124,6 @@ fi ivector_opts= if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - # note: subsample-feats, with negative n, will repeat each feature -n times. ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" fi @@ -153,4 +152,3 @@ $cmd $queue_opt JOB=1:$nj $dir/log/align.JOB.log \ steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir echo "$0: done aligning data." - diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 94bf322a514..76c77a38c46 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -216,11 +216,9 @@ if [ ! 
-z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - - ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + ivector_opts="--online-ivectors=$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else + ivector_opts="" echo 0 >$dir/info/ivector_dim fi @@ -322,14 +320,14 @@ if [ $stage -le 3 ]; then lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ ark:- ark:- \| \ - nnet3-chain-get-egs $valid_ivector_opt --srand=$srand \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ $egs_opts $chaindir/normalization.fst \ "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ - nnet3-chain-get-egs $train_subset_ivector_opt --srand=$srand \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ $egs_opts $chaindir/normalization.fst \ "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & wait; @@ -381,7 +379,7 @@ if [ $stage -le 4 ]; then lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ - nnet3-chain-get-egs $ivector_opt --srand=\$[JOB+$srand] $egs_opts \ + nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ --num-frames-overlap=$frames_overlap_per_eg \ "$feats" ark,s,cs:- ark:- \| \ nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 6622f3632f7..330f4d8c7d3 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -189,11 +189,9 @@ if [ ! 
-z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - - ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + ivector_opts="--online-ivectors=$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else + ivector_opts="" echo 0 >$dir/info/ivector_dim fi @@ -291,11 +289,11 @@ if [ $stage -le 3 ]; then <$dir/ali.scp >$dir/ali_special.scp $cmd $dir/log/create_valid_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $valid_ivector_opt $egs_opts "$valid_feats" \ + nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opts $egs_opts "$valid_feats" \ "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ "ark:$dir/valid_all.egs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $train_subset_ivector_opt $egs_opts "$train_subset_feats" \ + nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opts $egs_opts "$train_subset_feats" \ "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ "ark:$dir/train_subset_all.egs" || touch $dir/.error & wait; @@ -335,7 +333,7 @@ if [ $stage -le 4 ]; then echo "$0: Generating training examples on disk" # The examples will go round-robin to egs_list. $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts "$feats" \ + nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opts $egs_opts "$feats" \ "ark,s,cs:filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ nnet3-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh index 406b998fc71..017fd12acee 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -207,11 +207,9 @@ if [ ! 
-z $online_ivector_dir ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim >$dir/info/ivector_dim - - ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - priors_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + ivector_opts="--online-ivectors=$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +else + ivector_opts="" fi if [ $stage -le 2 ]; then @@ -345,7 +343,7 @@ fi num_pdfs=`am-info $alidir/final.mdl | grep pdfs | awk '{print $NF}' 2>/dev/null` || exit 1 $cmd $dir/log/create_priors_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $priors_ivector_opt $priors_egs_opts "$priors_feats" \ + nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opts $priors_egs_opts "$priors_feats" \ "$prior_ali_rspecifier ali-to-post ark:- ark:- |" \ ark:- \| nnet3-copy-egs ark:- $priors_egs_list || \ { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.log"; exit 1; } @@ -372,13 +370,13 @@ if [ $stage -le 4 ]; then $cmd $dir/log/create_valid_subset.log \ discriminative-get-supervision $supervision_all_opts \ scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ - nnet3-discriminative-get-egs $valid_ivector_opt $egs_opts \ + nnet3-discriminative-get-egs $ivector_opts $egs_opts \ $dir/final.mdl "$valid_feats" ark,s,cs:- "ark:$dir/valid_diagnostic.degs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ discriminative-get-supervision $supervision_all_opts \ scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ - nnet3-discriminative-get-egs $train_subset_ivector_opt $egs_opts \ + nnet3-discriminative-get-egs $ivector_opts $egs_opts \ $dir/final.mdl "$train_subset_feats" ark,s,cs:- "ark:$dir/train_diagnostic.degs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 @@ -408,7 +406,7 @@ if [ $stage -le 5 ]; then discriminative-get-supervision $supervision_all_opts \ "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \ "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz |" ark:- \| \ - nnet3-discriminative-get-egs $ivector_opt $egs_opts \ + nnet3-discriminative-get-egs $ivector_opts $egs_opts \ --num-frames-overlap=$frames_overlap_per_eg \ $dir/final.mdl "$feats" ark,s,cs:- ark:- \| \ nnet3-discriminative-copy-egs --random=true --srand=JOB ark:- $degs_list || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index 7bd8fa5f983..04830a4bc05 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -182,11 +182,9 @@ if [ ! 
-z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - - ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + ivector_opts="--online-ivectors=$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else + ivector_opts="" echo 0 >$dir/info/ivector_dim fi @@ -306,12 +304,12 @@ if [ $stage -le 3 ]; then rm -f $dir/.error 2>/dev/null $cmd $dir/log/create_valid_subset.log \ $get_egs_program \ - $valid_ivector_opt $egs_opts "$valid_feats" \ + $ivector_opts $egs_opts "$valid_feats" \ "$valid_targets" \ "ark:$dir/valid_all.egs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ $get_egs_program \ - $train_subset_ivector_opt $egs_opts "$train_subset_feats" \ + $ivector_opts $egs_opts "$train_subset_feats" \ "$train_subset_targets" \ "ark:$dir/train_subset_all.egs" || touch $dir/.error & wait; @@ -352,7 +350,7 @@ if [ $stage -le 4 ]; then # The examples will go round-robin to egs_list. $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ $get_egs_program \ - $ivector_opt $egs_opts "$feats" "$targets" \ + $ivector_opts $egs_opts "$feats" "$targets" \ ark:- \| \ nnet3-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; fi @@ -411,4 +409,3 @@ if [ $stage -le 6 ]; then fi echo "$0: Finished preparing training examples" - From 1b6f7f6f1bc6fb12e3b8ac310215d0542cd99399 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 6 Jan 2017 15:21:09 -0500 Subject: [PATCH 048/213] Updating SetZero() to also zero stats. --- src/nnet3/nnet-utils.cc | 7 +++++++ src/nnet3/nnet-utils.h | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index dbe676de1ef..02b92c19a40 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -172,10 +172,17 @@ void SetZero(bool is_gradient, Nnet *nnet) { for (int32 c = 0; c < nnet->NumComponents(); c++) { Component *comp = nnet->GetComponent(c); + NonlinearComponent *nc = dynamic_cast(comp); if (comp->Properties() & kUpdatableComponent) { UpdatableComponent *u_comp = dynamic_cast(comp); KALDI_ASSERT(u_comp != NULL); u_comp->SetZero(is_gradient); + } else if (nc != NULL) { + nc->ZeroStats(); + } else { + // Scale(0.0) is called as a backup; currently it should never + // do anything useful for any component type. + comp->Scale(0.0); } } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 8acdbfd9b96..75c75842817 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -54,7 +54,13 @@ int32 NumOutputNodes(const Nnet &nnet); int32 NumInputNodes(const Nnet &nnet); /// Calls SetZero (with the given is_gradient parameter) on all updatable -/// components of the nnet. +/// components of the nnet; calls ZeroComponentStats on all other components +/// that inherit from NonlinearComponent; and (just in case) calls Scale(0.0) on +/// all other components. 
+/// It's the same as ScaleNnet(0.0, nnet) except that if is_gradient is true it +/// can set the is_gradient_ flag on updatable components [to force simple +/// update]; and unlike ScaleNnet(0.0, nnet) it will get rid of NaNs that have +/// crept into the parameters or stats. void SetZero(bool is_gradient, Nnet *nnet); From 6e648b92296441c5d0c306da292c982f75d64614 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 6 Jan 2017 15:27:24 -0500 Subject: [PATCH 049/213] tedlium script changes; renaming max-cell-value to decay-time in LSTM scripts --- .../s5_r2/local/chain/compare_wer_general.sh | 50 +-- egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh | 198 +++++++++++ .../s5_r2/local/chain/tuning/run_tdnn_1c.sh | 282 ++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1b.sh | 317 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1c.sh | 279 +++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1d.sh | 312 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1e.sh | 314 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1f.sh | 314 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1g.sh | 316 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1h.sh | 316 +++++++++++++++++ egs/tedlium/s5_r2/run.sh | 2 +- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 54 +-- 12 files changed, 2704 insertions(+), 50 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh diff --git a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh index aebbd66349a..b8988fc8d1a 100755 --- a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh +++ b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh @@ -1,38 +1,37 @@ #!/bin/bash + echo $0 $* +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi + echo -n "System " for x in $*; do printf "% 10s" " $(basename $x)"; done echo -echo -n "WER on dev(orig) " -for x in $*; do - wer=$(grep Sum $x/decode_dev/score*/*ys | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on dev(rescored)" -for x in $*; do - wer=$(grep Sum $x/decode_dev_rescore/score*/*ys | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on test(orig) " -for x in $*; do - wer=$(grep Sum $x/decode_test/score*/*ys | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo +dirnames=(dev dev_rescore test test_rescore) +strings=("WER on dev(orig) " "WER on dev(rescored) " "WER on test(orig) " "WER on test(rescored)") -echo -n "WER on test(rescored)" -for x in $*; do - wer=$(grep Sum $x/decode_test_rescore/score*/*ys | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer +for n in 0 1 2 3; do + echo -n "${strings[$n]}" + for x in $*; do + wer=$(grep Sum $x/decode_${dirnames[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n " 
[looped:] " + for x in $*; do + wer=$(grep Sum $x/decode_looped_${dirnames[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi done -echo echo -n "Final train prob " @@ -61,4 +60,5 @@ for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done + echo diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh new file mode 100755 index 00000000000..9e795316352 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh @@ -0,0 +1,198 @@ +#!/bin/bash + +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/${train_set}_sp_hires_comb \ + --ivector-dir $train_ivector_dir \ + --tree-dir $tree_dir \ + --relu-dim 550 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..111a68d9878 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh @@ -0,0 +1,282 @@ +#!/bin/bash + +# run_tdnn_1c.sh is like run_tdnn_1b.sh but changing chunk-width from 150 to +# '140,110,160', and +# and --trainer.num-chunk-per-minibatch from 128 to 128,64 + +# run_tdnn_1b.sh is like run_tdnn_1a.sh but upgrading to xconfig-based +# config generation. + +# Results (11/29/2016, note, this build is is before the upgrade of the LM +# done in Nov 2016): +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_sp_bi exp/chain_cleaned/tdnn1b_sp_bi +# System tdnn_sp_bi tdnn1b_sp_bi +# WER on dev(orig) 10.3 10.2 +# WER on dev(rescored) 9.8 9.6 +# WER on test(orig) 9.8 9.7 +# WER on test(rescored) 9.3 9.2 +# Final train prob -0.0918 -0.0928 +# Final valid prob -0.1190 -0.1178 +# Final train prob (xent) -1.3572 -1.4666 +# Final valid prob (xent) -1.4415 -1.5473 + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). 
+# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1c #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width '140,110,160' \ + --trainer.num-chunk-per-minibatch '128,64' \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
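Once both the regular (stage 20) and 'looped' (stage 21) decoding passes have finished, their results can be compared directly. A minimal sketch, using $dir as set earlier in this script and the standard Kaldi helper utils/best_wer.sh; the dataset name is only an example:

for d in $dir/decode_dev $dir/decode_looped_dev; do
  # print the best WER over the LM-weight sweep produced by the scoring script
  grep WER $d/wer_* | utils/best_wer.sh
done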
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..5149e5a54e8 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,317 @@ +#!/bin/bash + +# this is as run_tdnn_lstm_1a.sh, but changing +# frames_per_chunk 150 to 140,100,160 +# and --trainer.num-chunk-per-minibatch from 128 to 128,64 +# and adding +# --egs.chunk-left-context-initial=0 +# and --egs.chunk-right-context-final=0 + + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
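Every variable in the configuration section below can be overridden from the command line, because utils/parse_options.sh (sourced further down) maps an option such as --decode-nj onto the shell variable decode_nj. A hypothetical invocation, with values chosen only for illustration:

# rerun from the network-configuration stage onwards, with fewer decoding jobs
local/chain/tuning/run_tdnn_lstm_1b.sh --stage 17 --decode-nj 8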
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1b #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
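Separately from decoding, once training (stage 18) has finished, the same kind of one-line model summary quoted near the top of this file (there, for tdnn_lstm1a) can be produced for this run. The directory name below assumes the naming convention these recipes use, tdnn_lstm<affix>_sp_bi under exp/chain_cleaned:

# prints num-iters, nj, num-params and the final train/valid objectives
steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1b_sp_bi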
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh new file mode 100755 index 00000000000..bb3c5b1a942 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +# 1c is as 1b, but adding the option --slow-start true. [since removed; it +# takes half the param change from the first two minibatches of each +# job]. The difference is probably just random noise. + + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1b_sp_bi exp/chain_cleaned/tdnn_lstm1c_sp_bi +# System tdnn_lstm1b_sp_bi tdnn_lstm1c_sp_bi +# WER on dev(orig) 9.1 8.9 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.9 8.9 +# WER on test(rescored) 8.4 8.5 +# Final train prob -0.0621 -0.0620 +# Final valid prob -0.0799 -0.0811 +# Final train prob (xent) -0.8300 -0.8117 +# Final valid prob (xent) -0.9500 -0.9448 + + + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1c #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. 
+common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.slow-start true \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh new file mode 100755 index 00000000000..28ca16d939c --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -0,0 +1,312 @@ +#!/bin/bash + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. 
note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1d #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=40 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=40 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=40 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
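The decoding stages in this script all rely on the same shell idiom for running the dev and test decodes in parallel and propagating failures: each dataset is handled in a backgrounded subshell, any failure drops a marker file, and the parent checks for it after wait. A stripped-down sketch of the pattern, where decode_one is only a placeholder, not a function in this recipe:

rm -f $dir/.error
for dset in dev test; do
  ( decode_one "$dset" || touch $dir/.error ) &   # placeholder command
done
wait
if [ -f $dir/.error ]; then
  echo "$0: something went wrong in decoding"
  exit 1
fi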
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh new file mode 100755 index 00000000000..5bfdc68fa3f --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -0,0 +1,314 @@ +#!/bin/bash + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
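One value worth noting in the options below is xent_regularize=0.1: the xconfig stage (stage 17) derives the learning-rate factor of the cross-entropy output from it, so that the xent branch learns at a rate independent of the regularization constant. With the default here the arithmetic is simply:

# learning_rate_factor = 0.5 / xent_regularize = 0.5 / 0.1 = 5.0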
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1e #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
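Both decoding stages finish by rescoring the lattices with steps/lmrescore_const_arpa.sh against data/lang_rescore, which is expected to hold a larger LM compiled into const-ARPA form (typically G.carpa). If that directory has not been prepared yet, one typical way to build it is sketched below; the ARPA file path is an assumption, not something this script creates:

# compile a bigger ARPA LM into const-ARPA form for lattice rescoring
utils/build_const_arpa_lm.sh data/local/local_lm/4gram.arpa.gz data/lang data/lang_rescore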
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh new file mode 100755 index 00000000000..ed778713907 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -0,0 +1,314 @@ +#!/bin/bash + +# 1f is as 1b, but increasing decay-time from 40 to 80. [see also 1e, at 20.] + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
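Like 1c, 1d and 1e, this variant reuses the training examples dumped by 1b (common_egs_dir is set below), which works because the chunk widths and chunk contexts are identical to 1b's; only the decay-time of the LSTM layers changes. To dump fresh egs instead, the option can simply be cleared on the command line, for example:

# hypothetical invocation that regenerates egs rather than reusing 1b's
local/chain/tuning/run_tdnn_lstm_1f.sh --common-egs-dir ""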
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1f #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=80 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=80 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=80 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
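A note on the acoustic weights used in both decoding stages: chain models are decoded with --acwt 1.0, and --post-decode-acwt 10.0 then scales the acoustic scores written to the lattices by 10, so the usual integer LM-weight sweep in scoring (here starting at --min-lmwt 5) behaves as it does for conventional systems. Roughly:

# acoustic scale used in the search:            1.0
# acoustic scores written to the lattices:      scaled by 10.0
# so an LM weight near 10 at scoring time corresponds to the 1.0 used in decoding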
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh new file mode 100755 index 00000000000..bbc17c77aea --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -0,0 +1,316 @@ +#!/bin/bash + +####################### +# 1g is as 1e, but reducing decay-time further from 20 to 10. +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
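The training and decoding context options defined below are meant to line up with each other; roughly, the chunk width and left context used when dumping egs correspond to the per-chunk settings passed to the regular decoder in stage 20:

#   egs / training                      decoding (stage 20)
#   frames_per_chunk=140,100,160   ->   --frames-per-chunk 140   (frames_per_chunk_primary)
#   chunk_left_context=40          ->   --extra-left-context 50
#   chunk_right_context=0          ->   --extra-right-context 0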
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1g #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=10 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=10 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=10 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh new file mode 100755 index 00000000000..8ffd43f27bc --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -0,0 +1,316 @@ +#!/bin/bash + +####################### +# 1h is as 1e, but increasing decay-time from to to 30. +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1h #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=30 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=30 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=30 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/run.sh b/egs/tedlium/s5_r2/run.sh index 19bc92a738c..754cec0494d 100755 --- a/egs/tedlium/s5_r2/run.sh +++ b/egs/tedlium/s5_r2/run.sh @@ -185,7 +185,7 @@ fi if [ $stage -le 17 ]; then # This will only work if you have GPUs on your system (and note that it requires # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html) - local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" + local/chain/run_tdnn.sh fi # The nnet3 TDNN recipe: diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 45a425dbc60..5e928a0f7c3 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -473,12 +473,15 @@ def generate_lstm_config(self): # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] # ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] -# max-cell-value=-1 [If >0, an approximate maximum on the contents of the cell (c_t); -# enforced by putting a scaling factor of -# recurrence_scale = 1 - abs(delay)/max_cell_value -# on the recurrence, i.e. the term c_{t-1} in the LSTM equations. -# E.g. setting this to 50 means the activations can't get bigger -# than about 50.] +# decay-time=-1 [If >0, an approximate maximum on how many frames +# can be remembered via summation into the cell +# contents c_t; enforced by putting a scaling factor +# of recurrence_scale = 1 - abs(delay)/decay_time on +# the recurrence, i.e. the term c_{t-1} in the LSTM +# equations. E.g. setting this to 20 means no more +# than about 20 frames' worth of history, +# i.e. history since about t = t-20, can be +# accumulated in c_t.] class XconfigFastLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "fast-lstm-layer" @@ -499,7 +502,7 @@ def set_default_configs(self): # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', - 'max-cell-value': -1.0 + 'decay-time': -1.0 } self.c_needed = False # keep track of whether the 'c' output is needed. @@ -561,11 +564,11 @@ def generate_lstm_config(self): cell_dim = self.config['cell-dim'] delay = self.config['delay'] affine_str = self.config['ng-affine-options'] - max_cell_value = self.config['max-cell-value'] - # we expect max_cell_value to be either -1, or large, like 10 or 50. 
- recurrence_scale = (1.0 if max_cell_value < 0 else - 1.0 - (abs(delay) / max_cell_value)) - assert recurrence_scale > 0 # or user may have set max-cell-value much + decay_time = self.config['decay-time'] + # we expect decay_time to be either -1, or large, like 10 or 50. + recurrence_scale = (1.0 if decay_time < 0 else + 1.0 - (abs(delay) / decay_time)) + assert recurrence_scale > 0 # or user may have set decay-time much # too small. lstm_str = self.config['lstm-nonlinearity-options'] bptrunc_str = ("clipping-threshold={0}" @@ -639,12 +642,15 @@ def generate_lstm_config(self): # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] # ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] -# max-cell-value=-1 [If >0, an approximate maximum on the contents of the cell (c_t); -# enforced by putting a scaling factor of -# recurrence_scale = 1 - abs(delay)/max_cell_value -# on the recurrence, i.e. the term c_{t-1} in the LSTM equations. -# E.g. setting this to 50 means the activations can't get bigger -# than about 50.] +# decay-time=-1 [If >0, an approximate maximum on how many frames +# can be remembered via summation into the cell +# contents c_t; enforced by putting a scaling factor +# of recurrence_scale = 1 - abs(delay)/decay_time on +# the recurrence, i.e. the term c_{t-1} in the LSTM +# equations. E.g. setting this to 20 means no more +# than about 20 frames' worth of history, +# i.e. history since about t = t-20, can be +# accumulated in c_t.] class XconfigFastLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "fast-lstmp-layer" @@ -665,7 +671,7 @@ def set_default_configs(self): # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', - 'max-cell-value': -1.0, + 'decay-time': -1.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0 @@ -745,11 +751,11 @@ def generate_lstm_config(self): rec_proj_dim = self.config['recurrent-projection-dim'] nonrec_proj_dim = self.config['non-recurrent-projection-dim'] affine_str = self.config['ng-affine-options'] - max_cell_value = self.config['max-cell-value'] - # we expect max_cell_value to be either -1, or large, like 10 or 50. - recurrence_scale = (1.0 if max_cell_value < 0 else - 1.0 - (abs(delay) / max_cell_value)) - assert recurrence_scale > 0 # or user may have set max-cell-value much + decay_time = self.config['decay-time'] + # we expect decay_time to be either -1, or large, like 10 or 50. + recurrence_scale = (1.0 if decay_time < 0 else + 1.0 - (abs(delay) / decay_time)) + assert recurrence_scale > 0 # or user may have set decay-time much # too small. 
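
Editor's note: the two hunks above rename max-cell-value to decay-time without changing the arithmetic; decay-time puts a scaling factor of 1 - abs(delay)/decay_time on the c_{t-1} recurrence. A minimal, standalone Python sketch of that arithmetic follows (it is not part of the patch; the helper name and the printed values are illustrative only). With the delay=-3 used by the fast-lstmp layers in the TED-LIUM tuning scripts above, decay-time 10 (as in 1g) gives a scale of 0.70, 20 (as in 1e) gives 0.85, and 30 (as in 1h) gives 0.90.

    # Sketch of the decay-time -> recurrence-scale mapping; assumes delay=-3
    # as in the tuning scripts above.  Not part of the patch.
    def recurrence_scale(delay, decay_time):
        if decay_time < 0:          # -1 (the default) disables the decay
            return 1.0
        scale = 1.0 - abs(delay) / float(decay_time)
        # mirrors the assert in lstm.py: decay-time must exceed abs(delay)
        assert scale > 0, "decay-time is too small relative to the delay"
        return scale

    if __name__ == "__main__":
        for decay_time in (-1, 10, 20, 30, 40):
            print(decay_time, recurrence_scale(-3, decay_time))

Because c_t is multiplied by this factor at every recurrence step, contributions older than roughly decay-time frames are attenuated away, which is the "approximate maximum on how many frames can be remembered" described in the option text above.
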
bptrunc_str = ("clipping-threshold={0}" From d128b92613005fb1e869ca2c395d26e8ff3e0556 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 6 Jan 2017 17:55:27 -0500 Subject: [PATCH 050/213] Add timing info to looped compilation --- src/nnet3/nnet-compile-looped.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index 62f29762580..1237ba6ce1e 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -268,6 +268,7 @@ static bool CompileLoopedInternal( const ComputationRequest &request3, int32 num_requests, NnetComputation *computation) { + KALDI_ASSERT(num_requests >= 3); std::vector extra_requests(num_requests - 3); const ComputationRequest *prev_request = &request2; @@ -312,11 +313,15 @@ void CompileLooped(const Nnet &nnet, int32 num_requests1 = 5, factor = 2, max_requests = 100, num_requests; + Timer timer; + for (num_requests = num_requests1; num_requests <= max_requests; num_requests *= factor) { if (CompileLoopedInternal(nnet, optimize_opts, request1, request2, request3, num_requests, computation)) { + KALDI_LOG << "Spent " << timer.Elapsed() + << " seconds in looped nnet3 compilation."; return; } else { KALDI_VLOG(2) << "Looped compilation failed with " From 55907735f9b9ec1d8d5f60194e115f7d1677a985 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 7 Jan 2017 15:41:00 -0500 Subject: [PATCH 051/213] Various bug fixes in scripts and code --- .../nnet3/train/frame_level_objf/common.py | 20 +++++-------- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 2 +- egs/wsj/s5/steps/nnet3/get_egs.sh | 11 ++++--- .../s5/steps/nnet3/get_egs_discriminative.sh | 14 ++++----- egs/wsj/s5/steps/nnet3/get_egs_targets.sh | 9 ++++-- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 11 ------- egs/wsj/s5/steps/nnet3/train_rnn.py | 14 ++------- egs/wsj/s5/utils/queue.pl | 9 +++--- src/nnet3/nnet-compute.cc | 29 +++++++++---------- src/nnet3/nnet-example-utils.cc | 1 + src/nnet3/nnet-example-utils.h | 3 -- src/nnet3bin/nnet3-get-egs.cc | 2 ++ 12 files changed, 52 insertions(+), 73 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 0826c9f0468..377a0575266 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -134,8 +134,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, num_hidden_layers, add_layers_period, left_context, right_context, momentum, max_param_change, shuffle_buffer_size, - run_opts, - cv_minibatch_size_str='256', frames_per_eg=-1, + run_opts, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, shrinkage_value=1.0, get_raw_nnet_from_am=True, @@ -182,7 +181,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, dir=dir, iter=iter, egs_dir=egs_dir, left_context=left_context, right_context=right_context, run_opts=run_opts, - minibatch_size_str=cv_minibatch_size_str, get_raw_nnet_from_am=get_raw_nnet_from_am, wait=False, background_process_handler=background_process_handler) @@ -192,7 +190,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, left_context=left_context, right_context=right_context, run_opts=run_opts, - minibatch_size_str=cv_minibatch_size_str, wait=False, + wait=False, get_raw_nnet_from_am=get_raw_nnet_from_am, background_process_handler=background_process_handler) @@ -365,7 +363,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, def 
compute_train_cv_probabilities(dir, iter, egs_dir, left_context, - right_context, run_opts, minibatch_size_str='1:256', + right_context, run_opts, wait=False, background_process_handler=None, get_raw_nnet_from_am=True): if get_raw_nnet_from_am: @@ -382,12 +380,11 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, nnet3-compute-prob "{model}" \ "ark,bg:nnet3-copy-egs {context_opts} \ ark:{egs_dir}/valid_diagnostic.egs ark:- | \ - nnet3-merge-egs --minibatch-size={minibatch_size_str} ark:- \ + nnet3-merge-egs --minibatch-size=1:64 ark:- \ ark:- |" """.format(command=run_opts.command, dir=dir, iter=iter, context_opts=context_opts, - minibatch_size_str=minibatch_size_str, model=model, egs_dir=egs_dir), wait=wait, background_process_handler=background_process_handler) @@ -397,20 +394,18 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, nnet3-compute-prob "{model}" \ "ark,bg:nnet3-copy-egs {context_opts} \ ark:{egs_dir}/train_diagnostic.egs ark:- | \ - nnet3-merge-egs --minibatch-size={minibatch_size_str} ark:- \ + nnet3-merge-egs --minibatch-size=1:64 ark:- \ ark:- |" """.format(command=run_opts.command, dir=dir, iter=iter, context_opts=context_opts, - minibatch_size_str=minibatch_size_str, model=model, egs_dir=egs_dir), wait=wait, background_process_handler=background_process_handler) def compute_progress(dir, iter, egs_dir, left_context, right_context, - run_opts, minibatch_size_str=256, - background_process_handler=None, wait=False, + run_opts, background_process_handler=None, wait=False, get_raw_nnet_from_am=True): if get_raw_nnet_from_am: prev_model = "nnet3-am-copy --raw=true {0}/{1}.mdl - |".format( @@ -429,13 +424,12 @@ def compute_progress(dir, iter, egs_dir, left_context, right_context, nnet3-show-progress --use-gpu=no "{prev_model}" "{model}" \ "ark,bg:nnet3-copy-egs {context_opts} \ ark:{egs_dir}/train_diagnostic.egs ark:- | \ - nnet3-merge-egs --minibatch-size={minibatch_size_str} ark:- \ + nnet3-merge-egs --minibatch-size=1:64 ark:- \ ark:- |" """.format(command=run_opts.command, dir=dir, iter=iter, model=model, context_opts=context_opts, - minibatch_size_str=minibatch_size_str, prev_model=prev_model, egs_dir=egs_dir), wait=wait, background_process_handler=background_process_handler) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 76c77a38c46..3ca2fc84627 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -216,7 +216,7 @@ if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - ivector_opts="--online-ivectors=$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else ivector_opts="" echo 0 >$dir/info/ivector_dim diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 330f4d8c7d3..27877680982 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -189,7 +189,7 @@ if [ ! 
-z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - ivector_opts="--online-ivectors=$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else ivector_opts="" echo 0 >$dir/info/ivector_dim @@ -201,8 +201,11 @@ if [ $stage -le 1 ]; then echo $num_frames > $dir/info/num_frames echo "$0: working out feature dim" feats_one="$(echo $feats | sed s/JOB/1/g)" - feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; - echo $feat_dim > $dir/info/feat_dim + if feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo $feat_dim > $dir/info/feat_dim + else # run without redirection to show the error. + feat-to-dim "$feats_one" -; exit 1 + fi else num_frames=$(cat $dir/info/num_frames) || exit 1; feat_dim=$(cat $dir/info/feat_dim) || exit 1; @@ -220,7 +223,7 @@ if [ $num_archives -eq 1 ]; then echo "*** $0: warning: the --frames-per-eg is too large to generate one archive with" echo "*** as many as --samples-per-iter egs in it. Consider reducing --frames-per-eg." sleep 4 -done +fi # We may have to first create a smaller number of larger archives, with number # $num_archives_intermediate, if $num_archives is more than the maximum number diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh index 017fd12acee..fd616160632 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -207,7 +207,7 @@ if [ ! -z $online_ivector_dir ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim >$dir/info/ivector_dim - ivector_opts="--online-ivectors=$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else ivector_opts="" fi @@ -217,12 +217,12 @@ if [ $stage -le 2 ]; then num_frames=$(steps/nnet2/get_num_frames.sh $data) echo $num_frames > $dir/info/num_frames echo "$0: working out feature dim" - feats_one="$(echo $feats | sed s/JOB/1/g)" - feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; - echo $feat_dim > $dir/info/feat_dim -else - num_frames=$(cat $dir/info/num_frames) || exit 1; - feat_dim=$(cat $dir/info/feat_dim) || exit 1; + feats_one="$(echo $feats | sed s:JOB:1:g)" + if feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo $feat_dim > $dir/info/feat_dim + else # run without stderr redirection to show the error. + feat-to-dim "$feats_one" -; exit 1 + fi fi # Working out total number of archives. Add one on the assumption the diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index 04830a4bc05..4af10e2dde1 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -182,7 +182,7 @@ if [ ! 
-z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - ivector_opts="--online-ivectors=$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else ivector_opts="" echo 0 >$dir/info/ivector_dim @@ -194,8 +194,11 @@ if [ $stage -le 1 ]; then echo $num_frames > $dir/info/num_frames echo "$0: working out feature dim" feats_one="$(echo $feats | sed s:JOB:1:g)" - feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; - echo $feat_dim > $dir/info/feat_dim + if feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo $feat_dim > $dir/info/feat_dim + else # run without stderr redirection to show the error. + feat-to-dim "$feats_one" -; exit 1 + fi else num_frames=$(cat $dir/info/num_frames) || exit 1; feat_dim=$(cat $dir/info/feat_dim) || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index c2a6028b930..44f4cca8cb6 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -107,13 +107,6 @@ def get_args(): steps/nnet3/get_saturation.pl) exceeds this threshold we scale the parameter matrices with the shrink-value.""") - parser.add_argument("--trainer.optimization.cv-minibatch-size", type=str, - dest='cv_minibatch_size', default='256', - help="""Size of the minibatch to be used in diagnostic - jobs (use smaller value for BLSTMs to control memory - usage). May be a more general rule as accepted by the - --minibatch-size option of nnet3-merge-egs; run that - program without args to see the format.""") # RNN specific trainer options parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=str, dest='num_chunk_per_minibatch', default='100', @@ -166,9 +159,6 @@ def process_args(args): if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): raise Exception("--trainer.rnn.num-chunk-per-minibatch has an invalid value"); - if not common_train_lib.validate_minibatch_size_str(args.cv_minibatch_size): - raise Exception("--trainer.optimization.cv-minibatch-size has an invalid value"); - if args.chunk_left_context < 0: raise Exception("--egs.chunk-left-context should be non-negative") @@ -426,7 +416,6 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, - cv_minibatch_size_str=args.cv_minibatch_size, run_opts=run_opts, get_raw_nnet_from_am=False, background_process_handler=background_process_handler) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 422540aee35..d546377a726 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -106,13 +106,6 @@ def get_args(): steps/nnet3/get_saturation.pl) exceeds this threshold we scale the parameter matrices with the shrink-value.""") - parser.add_argument("--trainer.optimization.cv-minibatch-size", type=str, - dest='cv_minibatch_size', default='256', - help="""Size of the minibatch to be used in diagnostic - jobs (use smaller value for BLSTMs to control memory - usage). 
May be a more general rule as accepted by the - --minibatch-size option of nnet3-merge-egs; run that - program without args to see the format.""") # RNN specific trainer options parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=str, dest='num_chunk_per_minibatch', default='100', @@ -162,9 +155,6 @@ def process_args(args): if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch): raise Exception("--trainer.rnn.num-chunk-per-minibatch has an invalid value"); - if not common_train_lib.validate_minibatch_size_str(args.cv_minibatch_size): - raise Exception("--trainer.optimization.cv-minibatch-size has an invalid value"); - if args.chunk_left_context < 0: raise Exception("--egs.chunk-left-context should be non-negative") @@ -310,7 +300,8 @@ def train(args, run_opts, background_process_handler): left_context_initial, right_context_final)) if args.chunk_width != frames_per_eg_str: raise Exception("mismatch between --egs.chunk-width and the frames_per_eg " - "in the egs dir {0} vs {1}".(args.chunk_width, frames_per_eg_str)) + "in the egs dir {0} vs {1}".format(args.chunk_width, + frames_per_eg_str)) if (args.num_jobs_final > num_archives): raise Exception('num_jobs_final cannot exceed the number of archives ' @@ -421,7 +412,6 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): momentum=args.momentum, max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, - cv_minibatch_size_str=args.cv_minibatch_size, run_opts=run_opts, background_process_handler=background_process_handler) diff --git a/egs/wsj/s5/utils/queue.pl b/egs/wsj/s5/utils/queue.pl index 69188ec074a..10fd3b1a885 100755 --- a/egs/wsj/s5/utils/queue.pl +++ b/egs/wsj/s5/utils/queue.pl @@ -91,10 +91,11 @@ () } sub caught_signal { - if ( defined $sge_job_id ) { # Signal trapped after submitting jobs - system ("qdel $sge_job_id"); - die "Caught a signal: $! , deleting SGE task: $sge_job_id and exiting\n"; - } + if ( defined $sge_job_id ) { # Signal trapped after submitting jobs + my $signal = $!; + system ("qdel $sge_job_id"); + die "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n"; + } } if (@ARGV < 2) { diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index d01327c8265..abda3646417 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -453,24 +453,19 @@ void NnetComputer::CheckNoPendingIo() { pending_commands_.push_back(program_counter_); program_counter_++; } - while (!pending_commands_.empty()) { + for (size_t i = 0; i < pending_commands_.size(); i++) { // the order here doesn't really matter; we go from back to front // as it's more efficient, not that efficiency really matters here. - int32 last_command = pending_commands_.back(); - if (c[last_command].command_type == kProvideOutput) { - // we can ignore that we didn't provide output to the user. - KALDI_VLOG(3) << "Output to node '" << nnet_.GetNodeName(c[last_command].arg2) - << "' was available but not used."; - pending_commands_.pop_back(); - } else { + int32 command = pending_commands_[i]; + if (c[command].command_type == kAcceptInput) { // we can't ignore if we needed input from the user that hasn't been // provided. 
- KALDI_ASSERT(c[last_command].command_type == kAcceptInput); - int32 node = c[last_command].arg2; - KALDI_ERR << "Cannot run computation because we did not get input for node '" + int32 node = c[command].arg2; + KALDI_ERR << "Cannot run computation-- we did not get input for node '" << nnet_.GetNodeName(node) << "'"; } } + pending_commands_.clear(); } int32 NnetComputer::GetIoMatrixIndex(const std::string &node_name, bool is_output) { @@ -481,9 +476,9 @@ int32 NnetComputer::GetIoMatrixIndex(const std::string &node_name, bool is_outpu // first make sure all the I/O commands that we immediately expect, are listed // in 'pending_commands_'. while (program_counter_ < static_cast(computation_.commands.size()) && - (c[program_counter_].command_type == kAcceptInput || - c[program_counter_].command_type == kProvideOutput || - c[program_counter_].command_type == kNoOperationMarker)) { + ((c[program_counter_].command_type == kAcceptInput || + c[program_counter_].command_type == kProvideOutput || + c[program_counter_].command_type == kNoOperationMarker))) { if (c[program_counter_].command_type != kNoOperationMarker) pending_commands_.push_back(program_counter_); program_counter_++; @@ -495,7 +490,11 @@ int32 NnetComputer::GetIoMatrixIndex(const std::string &node_name, bool is_outpu int32 this_submatrix_index = c[command].arg1, this_node_index = c[command].arg2; if (this_command_is_output == is_output && node_index == this_node_index) { - pending_commands_.erase(pending_commands_.begin() + i); + if (!is_output) { + pending_commands_.erase(pending_commands_.begin() + i); + // don't erase the command for outputs, as that would prevent things + // from being output twice, which is an unnecessary restriction. + } if (!(computation_.IsWholeMatrix(this_submatrix_index))) KALDI_ERR << "Getting input or output that is not a whole matrix " << "(probably some optimization code needs to be changed)"; diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 28578de42fb..7c3743c3a7f 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -820,6 +820,7 @@ void UtteranceSplitter::GetChunksForUtterance( config_.right_context_final : config_.right_context); t += chunk_sizes[i]; } + SetOutputWeights(utterance_length, chunk_info); AccStatsForUtterance(utterance_length, *chunk_info); // check that the end of the last chunk doesn't go more than // 'config_.frame_subsampling_factor - 1' frames past the end diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 021e91959e3..d1eb85b6d11 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -122,9 +122,6 @@ struct ExampleGenerationConfig { "frames of right context of input features that are added " "to each example at the end of the utterance (if <0, this " "defaults to the same as --right-context)"); - po->Register("right-context", &right_context, "Number of frames of right " - "context of input features that are added to each " - "example"); po->Register("num-frames", &num_frames_str, "Number of frames with labels " "that each example contains (i.e. the left and right context " "are to be added to this). May just be an integer (e.g. 
" diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc index 562684c30ab..03623f02a07 100644 --- a/src/nnet3bin/nnet3-get-egs.cc +++ b/src/nnet3bin/nnet3-get-egs.cc @@ -126,6 +126,8 @@ static bool ProcessFile(const MatrixBase &feats, iter->second *= chunk.output_weights[i]; } + eg.io.push_back(NnetIo("output", num_pdfs, 0, labels)); + if (compress) eg.Compress(); From 93dcc071e2970a958013fa32c2ea1c71f9c2c3b2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 8 Jan 2017 15:08:52 -0500 Subject: [PATCH 052/213] Add ConstantComponent [better alternative to ConstantFunctionComponent, now deprecated] and test it. --- egs/tedlium/s5_r2/local/nnet3/compare_wer.sh | 51 +++++ src/nnet3/nnet-analyze.cc | 6 +- src/nnet3/nnet-compile.cc | 37 +++- src/nnet3/nnet-component-itf.cc | 2 + src/nnet3/nnet-computation-graph.cc | 8 +- src/nnet3/nnet-general-component.cc | 190 +++++++++++++++++++ src/nnet3/nnet-general-component.h | 92 +++++++++ src/nnet3/nnet-simple-component.h | 5 +- src/nnet3/nnet-test-utils.cc | 13 +- 9 files changed, 389 insertions(+), 15 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/nnet3/compare_wer.sh diff --git a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..012ea702427 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh @@ -0,0 +1,51 @@ +#!/bin/bash + + +echo $0 $* + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi + +echo -n "System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +dirnames=(dev dev_rescore test test_rescore) +strings=("WER on dev(orig) " "WER on dev(rescored) " "WER on test(orig) " "WER on test(rescored)") + +for n in 0 1 2 3; do + echo -n "${strings[$n]}" + for x in $*; do + wer=$(grep Sum $x/decode_${dirnames[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n " [looped:] " + for x in $*; do + wer=$(grep Sum $x/decode_looped_${dirnames[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 956c933d417..c5fedf0240b 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -725,10 +725,10 @@ void ComputationChecker::CheckComputationIndexes() const { // note: input may be the empty matrix (in unusual circumstances, for non-simple // components). 
if (c.arg3 < 0 || c.arg3 >= num_submatrices || - (c.arg3 == 0 && !(properties & kSimpleComponent)) || + (c.arg3 == 0 && (properties & kSimpleComponent)) || c.arg4 < 1 || c.arg4 >= num_submatrices) - KALDI_ERR << "Sub-matrix indexes out of range."; - if (submatrices[c.arg3].num_cols != component->InputDim()) + KALDI_ERR << "Sub-matrix indexes out of range."; + if (c.arg3 > 0 && submatrices[c.arg3].num_cols != component->InputDim()) KALDI_ERR << "Input-dim mismatch."; if (submatrices[c.arg4].num_cols != component->OutputDim()) KALDI_ERR << "Input-dim mismatch."; diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index d31e1ad5289..4ea0ecd5e05 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -51,6 +51,7 @@ void Compiler::CreateComputation(const CompilerOptions &opts, NnetComputation *computation) { computation->Clear(); ComputationGraphBuilder builder(nnet_, &graph_); + // note: there are only >1 segments in a 'looped' computation. for (size_t segment = 0; segment < requests_.size(); segment++) { builder.Compute(*(requests_[segment])); if (!builder.AllOutputsAreComputable()) { @@ -59,18 +60,23 @@ void Compiler::CreateComputation(const CompilerOptions &opts, } builder.Prune(); } - // see function declaration's comment for meaning of "phases". + // see function declaration's comment for more on the meaning of "phases" (a + // phase will later be decomposed into one or more steps). for each segment + // s, phases_per_segment[s] is a list of phases; each phase is a list of + // cindex_ids. std::vector > > phases_per_segment; ComputeComputationPhases(nnet_, graph_, &phases_per_segment); std::vector > steps; steps.reserve(1000); // maps each step to the segment in which it appears. in the normal case - // (non-online computation), a vector of all zeros. + // (non-looped computation), a vector of all zeros. std::vector step_to_segment; { + // note: this class will output to 'steps' and to 'cindex_id_to_location_'. + // it may incidentally change 'graph_' by adding a few cindexes. ComputationStepsComputer steps_computer(nnet_, &graph_, &steps, &cindex_id_to_location_); @@ -80,7 +86,8 @@ void Compiler::CreateComputation(const CompilerOptions &opts, while (step_to_segment.size() < steps.size()) step_to_segment.push_back(segment); - // save memory, by deleting the phases we just consumed. + // save memory, by deleting the phases we just consumed. the + // following two lines just exist to save memory. std::vector > temp; phases_per_segment[segment].swap(temp); } @@ -280,10 +287,23 @@ void Compiler::CreateStepInfo( for (int32 row_index = 0; row_index < num_ids; row_index++) this_info.output_indexes[row_index] = graph_.cindexes[this_info.output_cindex_ids[row_index]].second; - KALDI_ASSERT(num_ids > 0); - // node id's of all Cindexes are the same, so just use first one. - this_info.node_index = - graph_.cindexes[this_info.output_cindex_ids.front()].first; + if (num_ids > 0) { + // node id's of all Cindexes are the same, so just use first one. + this_info.node_index = + graph_.cindexes[this_info.output_cindex_ids.front()].first; + } else { + // it's possible to have an empty step if it's the component-input step of + // a GeneralComponent that does not always have dependencies, such as the + // ConstantFunctionComponent. This is just a kind of placeholder; it will + // generate no commands. The next command works because the next + // step will be the propagate for that Component, whose node-index is one + // more than the component-input node. 
+ KALDI_ASSERT((step+1) < by_step->size() && !(*by_step)[step+1].empty()); + this_info.node_index = + graph_.cindexes[(*by_step)[step+1][0]].first - 1; + KALDI_ASSERT(this_info.node_index >= 0); + continue; // we don't need to do anything else for this step. + } const NetworkNode &node = nnet_.GetNode(this_info.node_index); int32 num_rows = num_ids, num_cols = node.Dim(nnet_); @@ -1077,7 +1097,8 @@ void Compiler::OutputDebugInfo(NnetComputation *computation) const { computation->matrix_debug_info.resize(num_matrices); for (int32 step = 0; step < num_steps; step++) { const StepInfo &step_info = steps_[step]; - KALDI_ASSERT(step_info.value != 0); + if (step_info.value == 0) + continue; // e.g. input step for ConstantComponent. if (!computation->IsWholeMatrix(step_info.value)) continue; int32 value_matrix = computation->submatrices[step_info.value].matrix_index; diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 00dd802e091..23a8662a0d5 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -143,6 +143,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new StatisticsPoolingComponent(); } else if (component_type == "ConstantFunctionComponent") { ans = new ConstantFunctionComponent(); + } else if (component_type == "ConstantComponent") { + ans = new ConstantComponent(); } else if (component_type == "DropoutComponent") { ans = new DropoutComponent(); } else if (component_type == "BackpropTruncationComponent") { diff --git a/src/nnet3/nnet-computation-graph.cc b/src/nnet3/nnet-computation-graph.cc index 1761dd1b775..7c20f3ae711 100644 --- a/src/nnet3/nnet-computation-graph.cc +++ b/src/nnet3/nnet-computation-graph.cc @@ -1596,6 +1596,12 @@ void ComputationStepsComputer::ProcessInputOrOutputStep( int32 ComputationStepsComputer::AddStep(const std::vector &cindexes, bool add_if_absent) { + // note: we can't assert that cindexes is nonempty, because it's possible for + // input steps for GeneralComponents to be empty if they require no input + // indexes; and because the compiler code expects component steps to be + // preceded by component-input steps, we can't just omit these empty steps. + // [note: a component-input step is about preparing the input for a component's + // propagation.] int32 step_index = steps_->size(); steps_->push_back(std::vector()); std::vector &step = steps_->back(); // vector of cindex_id. @@ -1639,7 +1645,6 @@ int32 ComputationStepsComputer::AddStep(const std::vector &cindexes, int32 ComputationStepsComputer::AddStep(std::vector *cindex_ids) { int32 step_index = steps_->size(); - KALDI_ASSERT(!cindex_ids->empty()); steps_->push_back(std::vector()); steps_->back().swap(*cindex_ids); std::vector::const_iterator iter = steps_->back().begin(), @@ -1769,6 +1774,7 @@ void ComputationStepsComputer::ProcessComponentStep( int32 c = *set_iter; input_step.push_back(graph_->cindexes[c]); } + // sort the input cindexes. 
std::sort(input_step.begin(), input_step.end()); diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 9772c31b13b..13ccb0a7714 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1185,5 +1185,195 @@ void BackpropTruncationComponent::Add(BaseFloat alpha, num_zeroed_ += alpha * other->num_zeroed_; } + +std::string ConstantComponent::Info() const { + std::ostringstream stream; + stream << UpdatableComponent::Info() + << ", " << Type() + << ", output-dim=" << OutputDim() + << ", is-updatable=" << std::boolalpha << is_updatable_ + << ", use-natural-gradient=" << std::boolalpha + << use_natural_gradient_; + PrintParameterStats(stream, "output", output_, true); + return stream.str(); +} + +ConstantComponent::ConstantComponent(): + UpdatableComponent(), is_updatable_(true), + use_natural_gradient_(true) { } + +ConstantComponent::ConstantComponent( + const ConstantComponent &other): + UpdatableComponent(other), output_(other.output_), + is_updatable_(other.is_updatable_), + use_natural_gradient_(other.use_natural_gradient_), + preconditioner_(other.preconditioner_) { } + +void ConstantComponent::Propagate( + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + out->CopyRowsFromVec(output_); +} + +void ConstantComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + // we don't update in_deriv, since we set the flag + // kBackpropAdds, and the output doesn't depend on the + // input, so the input-derivative is zero. + if (to_update_in) { + ConstantComponent *to_update = + dynamic_cast(to_update_in); + if (to_update->is_updatable_) { + // only do the update if the is_updatable_ flag is set. 
+ KALDI_ASSERT(to_update && to_update->is_updatable_); + if (to_update->use_natural_gradient_ && !to_update->is_gradient_) { + CuMatrix out_deriv_copy(out_deriv); + BaseFloat scale = 1.0; + to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, + NULL, &scale); + to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, + out_deriv_copy); + } else { + to_update->output_.AddRowSumMat(to_update->learning_rate_, + out_deriv); + } + } + } +} + +void ConstantComponent::Read(std::istream &is, bool binary) { + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &learning_rate_factor_); + ReadToken(is, binary, &token); + } else { + learning_rate_factor_ = 1.0; + } + if (token == "") { + ReadBasicType(is, binary, &is_gradient_); + ReadToken(is, binary, &token); + } else { + is_gradient_ = false; + } + if (token == "") { + ReadBasicType(is, binary, &max_change_); + ReadToken(is, binary, &token); + } else { + max_change_ = 0.0; + } + if (token == "") { + ReadBasicType(is, binary, &learning_rate_); + ReadToken(is, binary, &token); + } else { + learning_rate_ = 0.001; + } + if (token != "") { + KALDI_ERR << "Expected token , got " << token; + } + output_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &is_updatable_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &use_natural_gradient_); + ExpectToken(is, binary, ""); +} + +void ConstantComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate + WriteToken(os, binary, ""); + output_.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, is_updatable_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, use_natural_gradient_); + WriteToken(os, binary, ""); +} + +Component* ConstantComponent::Copy() const { + return new ConstantComponent(*this); +} + +void ConstantComponent::Scale(BaseFloat scale) { + if (is_updatable_) + output_.Scale(scale); +} + +void ConstantComponent::Add(BaseFloat alpha, const Component &other_in) { + if (is_updatable_) { + const ConstantComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + output_.AddVec(alpha, other->output_); + } +} + +void ConstantComponent::SetZero(bool treat_as_gradient) { + if (treat_as_gradient) { + SetActualLearningRate(1.0); + is_gradient_ = true; + } + output_.SetZero(); +} + +void ConstantComponent::PerturbParams(BaseFloat stddev) { + CuVector temp_output(output_.Dim(), kUndefined); + temp_output.SetRandn(); + output_.AddVec(stddev, temp_output); +} + +BaseFloat ConstantComponent::DotProduct( + const UpdatableComponent &other_in) const { + KALDI_ASSERT(is_updatable_); + const ConstantComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + return VecVec(output_, other->output_); +} + +void ConstantComponent::InitFromConfig(ConfigLine *cfl) { + int32 output_dim = 0; + InitLearningRatesFromConfig(cfl); + bool ok = cfl->GetValue("output-dim", &output_dim); + cfl->GetValue("is-updatable", &is_updatable_); + cfl->GetValue("use-natural-gradient", &use_natural_gradient_); + BaseFloat output_mean = 0.0, output_stddev = 0.0; + cfl->GetValue("output-mean", &output_mean); + cfl->GetValue("output-stddev", &output_stddev); + if (!ok || cfl->HasUnusedValues() || output_dim <= 0) { + KALDI_ERR << "Bad initializer " << cfl->WholeLine(); + } + Vector output(output_dim); + output.SetRandn(); + 
+  output.Scale(output_stddev);
+  output.Add(output_mean);
+  output_ = output;
+}
+
+int32 ConstantComponent::NumParameters() const {
+  KALDI_ASSERT(is_updatable_);
+  return output_.Dim();
+}
+
+void ConstantComponent::Vectorize(VectorBase<BaseFloat> *params) const {
+  params->CopyFromVec(output_);
+}
+
+void ConstantComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
+  output_.CopyFromVec(params);
+}
+
+
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h
index f389d019522..780ec8466e6 100644
--- a/src/nnet3/nnet-general-component.h
+++ b/src/nnet3/nnet-general-component.h
@@ -580,6 +580,98 @@ class BackpropTruncationComponentPrecomputedIndexes:
   }
 };
+
+// ConstantComponent returns a constant value for all requested
+// indexes, and it has no dependencies on any input.
+// It's like a ConstantFunctionComponent, but done the "right"
+// way without requiring an unnecessary input.
+// It is optionally trainable, and optionally you can use natural
+// gradient.
+class ConstantComponent: public UpdatableComponent {
+ public:
+  // actually this component requires no inputs; this value
+  // is really a don't-care.
+  virtual int32 InputDim() const { return output_.Dim(); }
+
+  virtual int32 OutputDim() const { return output_.Dim(); }
+
+  virtual std::string Info() const;
+
+  // possible parameter values with their defaults:
+  // is-updatable=true use-natural-gradient=true output-dim=-1
+  // output-mean=0 output-stddev=0
+  virtual void InitFromConfig(ConfigLine *cfl);
+
+  ConstantComponent();
+
+  ConstantComponent(const ConstantComponent &other);
+
+  virtual std::string Type() const { return "ConstantComponent"; }
+  virtual int32 Properties() const {
+    return
+        (is_updatable_ ? kUpdatableComponent|kLinearInParameters : 0);
+  }
+  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
+                         const CuMatrixBase<BaseFloat> &in,
+                         CuMatrixBase<BaseFloat> *out) const;
+  virtual void Backprop(const std::string &debug_info,
+                        const ComponentPrecomputedIndexes *indexes,
+                        const CuMatrixBase<BaseFloat> &, // in_value
+                        const CuMatrixBase<BaseFloat> &, // out_value
+                        const CuMatrixBase<BaseFloat> &out_deriv,
+                        Component *to_update,
+                        CuMatrixBase<BaseFloat> *in_deriv) const;
+
+  virtual void Read(std::istream &is, bool binary);
+  virtual void Write(std::ostream &os, bool binary) const;
+
+  virtual Component* Copy() const;
+
+  // Some functions that are only to be reimplemented for GeneralComponents.
+  virtual void GetInputIndexes(const MiscComputationInfo &misc_info,
+                               const Index &output_index,
+                               std::vector<Index> *desired_indexes) const {
+    desired_indexes->clear();  // requires no inputs.
+  }
+
+  // This function returns true if at least one of the input indexes used to
+  // compute this output index is computable.
+  // it's simple because this component requires no inputs.
+  virtual bool IsComputable(const MiscComputationInfo &misc_info,
+                            const Index &output_index,
+                            const IndexSet &input_index_set,
+                            std::vector<Index> *used_inputs) const {
+    if (used_inputs) used_inputs->clear();
+    return true;
+  }
+
+  // Some functions from base-class UpdatableComponent.
+  virtual void Scale(BaseFloat scale);
+  virtual void Add(BaseFloat alpha, const Component &other);
+  virtual void SetZero(bool treat_as_gradient);
+  virtual void PerturbParams(BaseFloat stddev);
+  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
+  virtual int32 NumParameters() const;
+  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
+  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
+ private:
+
+  // the output value-- a vector.
+  CuVector<BaseFloat> output_;
+
+  bool is_updatable_;
+  // if true, and if updatable, do natural-gradient update.
+  bool use_natural_gradient_;
+  OnlineNaturalGradient preconditioner_;
+
+  const ConstantComponent &operator
+      = (const ConstantComponent &other); // Disallow.
+};
+
+
+
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index ba7c679cb6c..f8cd39cb06e 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -1332,8 +1332,9 @@ class PerElementOffsetComponent: public UpdatableComponent {
 // i.e. its output does not depend on its input.  It is the same as
 // an affine component with the linear term fixed at zero.
 // It is optionally trainable, and optionally you can use natural
-// gradient.  The input is required only because the framework
-// requires components to have an input.
+// gradient.  The input is required only because it's more convenient
+// to make SimpleComponents [but see ConstantComponent, which requires
+// no inputs].
 class ConstantFunctionComponent: public UpdatableComponent {
  public:
   virtual int32 InputDim() const { return input_dim_; }
diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc
index 7ab46d1003e..18131aaa213 100644
--- a/src/nnet3/nnet-test-utils.cc
+++ b/src/nnet3/nnet-test-utils.cc
@@ -388,6 +388,11 @@ void GenerateConfigSequenceLstm(
   os << "input-node name=input dim=" << input_dim << std::endl;
+  // trainable cell value for start/end of file.
+  os << "component name=c0 type=ConstantComponent"
+     << " output-dim=" << cell_dim << std::endl;
+
   // Parameter Definitions W*(* replaced by - to have valid names)
   // Input gate control : Wi* matrices
   os << "component name=Wi-xr type=NaturalGradientAffineComponent"
@@ -467,7 +472,13 @@ void GenerateConfigSequenceLstm(
   }
   std::string spliced_input = temp_string_stream.str();
-  std::string c_tminus1 = "Sum(IfDefined(Offset(c1_t, -1)), IfDefined(Offset( c2_t, -1)))";
+  std::string c_tminus1 = "Sum(Failover(Offset(c1_t, -1), c0), IfDefined(Offset( c2_t, -1)))";
+
+
+  // c0.  note: the input is never used as the component requires
+  // no input indexes; we just write itself as input to keep the
+  // structures happy.
+  os << "component-node name=c0 component=c0 input=c0\n";
   // i_t
   os << "component-node name=i1 component=Wi-xr input=Append("

From 2b66d289e1ffc343dcc8af051f35fa97eb790bcb Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sat, 14 Jan 2017 18:58:40 -0500
Subject: [PATCH 053/213] Various refactoring of discriminative training; other fixes.
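The main addition below is steps/nnet3/get_degs.sh, which decodes the
denominator lattices and dumps discriminative examples in a single step, so
the non-compact lattices never have to be written to disk; the older path
(make_denlats.sh plus get_egs_discriminative.sh) is kept, but now points
users at the new script.  As a rough sketch of how the two stages fit
together (the directory names, the --nj value and the $decode_cmd variable
are illustrative, following the scripts' own usage messages and the usual
cmd.sh convention, not a particular recipe):

    # decode and dump discriminative examples in one pass
    steps/nnet3/get_degs.sh --cmd "$decode_cmd" --nj 200 \
      data/train data/lang exp/nnet3/tdnn_a exp/nnet3/tdnn_a_ali exp/nnet3/tdnn_a_degs

    # then train on the dumped examples
    steps/nnet3/train_discriminative.sh --criterion smbr \
      exp/nnet3/tdnn_a_degs exp/nnet3/tdnn_a_smbr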
--- .../s5/local/nnet3/run_tdnn_discriminative.sh | 30 +- .../s5/local/chain/run_tdnn_discriminative.sh | 8 +- .../s5/local/nnet3/run_tdnn_discriminative.sh | 30 +- .../tuning/run_blstm_6h_discriminative.sh | 8 +- .../tuning/run_tdnn_6h_discriminative.sh | 8 +- .../local/nnet3/run_blstm_discriminative.sh | 8 +- .../local/nnet3/run_tdnn_discriminative.sh | 30 +- .../s5/local/nnet3/run_tdnn_discriminative.sh | 30 +- egs/tedlium/s5_r2/local/nnet3/compare_wer.sh | 4 +- .../nnet3/tuning/run_tdnn_lstm_1b_disc.sh | 187 +++++++ .../s5/local/nnet3/run_lstm_discriminative.sh | 26 +- .../s5/local/nnet3/run_tdnn_discriminative.sh | 26 +- egs/wsj/s5/steps/nnet3/align.sh | 5 +- egs/wsj/s5/steps/nnet3/chain/train.py | 5 - egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh | 4 +- egs/wsj/s5/steps/nnet3/get_degs.sh | 499 ++++++++++++++++++ egs/wsj/s5/steps/nnet3/get_egs.sh | 2 +- .../s5/steps/nnet3/get_egs_discriminative.sh | 45 +- egs/wsj/s5/steps/nnet3/make_denlats.sh | 5 +- .../s5/steps/nnet3/train_discriminative.sh | 43 +- egs/wsj/s5/utils/filter_scps.pl | 3 +- egs/wsj/s5/utils/split_data.sh | 15 +- src/chainbin/nnet3-chain-copy-egs.cc | 8 +- src/nnet3/discriminative-supervision.cc | 89 ++-- src/nnet3/discriminative-supervision.h | 90 ++-- src/nnet3/nnet-am-decodable-simple.cc | 4 +- src/nnet3/nnet-am-decodable-simple.h | 5 +- src/nnet3/nnet-chain-example.cc | 22 - src/nnet3/nnet-chain-example.h | 9 - src/nnet3/nnet-discriminative-example.cc | 21 - src/nnet3/nnet-discriminative-example.h | 9 - src/nnet3bin/Makefile | 2 +- .../discriminative-get-supervision.cc | 100 ---- src/nnet3bin/nnet3-align-compiled.cc | 11 +- src/nnet3bin/nnet3-discriminative-copy-egs.cc | 9 +- src/nnet3bin/nnet3-discriminative-get-egs.cc | 53 +- 36 files changed, 958 insertions(+), 495 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh create mode 100755 egs/wsj/s5/steps/nnet3/get_degs.sh delete mode 100644 src/nnet3bin/discriminative-get-supervision.cc diff --git a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh index 4afa867503a..aa2a845d6a8 100644 --- a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh @@ -8,7 +8,7 @@ set -o pipefail # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# +# . ./cmd.sh @@ -38,16 +38,15 @@ dir=${srcdir}_${criterion} ## Egs options frames_per_eg=150 frames_overlap_per_eg=30 -truncate_deriv_weights=10 ## Nnet training options effective_learning_rate=0.00000125 max_param_change=1 num_jobs_nnet=4 num_epochs=2 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false +adjust_priors=true # May need to be set to false # because it does not help in some setups modify_learning_rates=true last_layer_factor=0.1 @@ -57,8 +56,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! 
cuda-compiled; then - cat < $dir/num_jobs -sdata=$data/split$nj -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +sdata=$data/split${nj}utt +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \ + split_data.sh --per-utt $data $nj || exit 1; if $use_gpu; then queue_opt="--gpu 1" diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 42d302c34a0..374e1036f00 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -86,11 +86,6 @@ def get_args(): action=common_lib.StrToBoolAction, choices=["true", "false"], help="") - parser.add_argument("--chain.truncate-deriv-weights", type=float, - dest='truncate_deriv_weights', default=0, - help="""Can be used to set to zero the weights of - derivs from frames near the edges. (counts subsampled - frames)""") parser.add_argument("--chain.frame-subsampling-factor", type=int, dest='frame_subsampling_factor', default=3, help="ratio of frames-per-second of features we " diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index ada92e66ff4..cb1d7d1c357 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -20,8 +20,6 @@ num_epochs=10 # Number of epochs of training; # Be careful with this: we actually go over the data # num-epochs * frame-subsampling-factor times, due to # using different data-shifts. -truncate_deriv_weights=0 # can be used to set to zero the weights of derivs from frames - # near the edges. (counts subsampled frames). apply_deriv_weights=true initial_effective_lrate=0.0002 final_effective_lrate=0.00002 @@ -530,7 +528,7 @@ while [ $x -lt $num_iters ]; do $this_cache_io_opts $parallel_train_opts $deriv_time_opts \ --max-param-change=$this_max_param_change \ --print-interval=10 "$mdl" $dir/den.fst \ - "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights=$truncate_deriv_weights --frame-shift=$frame_shift ark:$egs_dir/cegs.$archive.ark ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-chain-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ + "ark,bg:nnet3-chain-copy-egs --frame-shift=$frame_shift ark:$egs_dir/cegs.$archive.ark ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-chain-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh new file mode 100755 index 00000000000..cc3ab5c4b13 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -0,0 +1,499 @@ +#!/bin/bash + +# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# Copyright 2014-2015 Vimal Manohar + +# Decodes denlats and dumps egs for discriminative training, in one script +# (avoids writing the non-compact lattices to disk, which can use a lot of disk +# space). + + +# Begin configuration section. +cmd=run.pl +max_copy_jobs=5 # Limit disk I/O + +# feature options +feat_type=raw # set it to 'lda' to use LDA features. +transform_dir= # If this is a SAT system, directory for transforms +online_ivector_dir= + +# example splitting and context options +frames_per_eg=150 # number of frames of labels per example. + # Note: may in general be a comma-separated string of alternative + # durations; the first one (the principal num-frames) is preferred. 
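+                   # For example, an illustrative (non-default) setting like
+                   #   frames_per_eg=150,110,90
+                   # lets the example-splitting code pick one of the alternative
+                   # chunk sizes when an utterance's length does not divide nicely
+                   # into 150-frame pieces, reducing wasted or overlapped frames.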
+frames_overlap_per_eg=30 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +looped=false # Set to true to enable looped decoding [can + # be a bit faster, for forward-recurrent models like LSTMs.] + +# .. these context options also affect decoding. +extra_left_context=0 # amount of left-context per eg, past what is required by the model + # (only useful for recurrent networks like LSTMs/BLSTMs) +extra_right_context=0 # amount of right-context per eg, past what is required by the model + # (only useful for backwards-recurrent networks like BLSTMs) +extra_left_context_initial=-1 # if >= 0, the --extra-left-context to use at + # the start of utterances. Recommend 0 if you + # used 0 for the baseline DNN training; if <0, + # defaults to same as extra_left_context +extra_right_context_final=-1 # if >= 0, the --extra-right-context to use at + # the end of utterances. Recommend 0 if you + # used 0 for the baseline DNN training; if <0, + # defaults to same as extra_left_context + +compress=true # set this to false to disable lossy compression of features + # dumped with egs (e.g. if you want to see whether results are + # affected). + +num_utts_subset=80 # number of utterances in validation and training + # subsets used for diagnostics. +num_egs_subset=800 # number of egs (maximum) for the validation and training + # subsets used for diagnostics. +frames_per_iter=400000 # each iteration of training, see this many frames + # per job. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. +cleanup=true + +stage=0 +nj=200 + +# By default this script uses final.mdl in , this configures it. +iter=final + + +# decoding-graph option +self_loop_scale=0.1 # for decoding graph.. should be 1.0 for chain models. + +# options relating to decoding. +frames_per_chunk_decoding=150 +beam=13.0 +lattice_beam=7.0 +acwt=0.1 +max_active=5000 +min_active=200 +max_mem=20000000 # This will stop the processes getting too large. +# This is in bytes, but not "real" bytes-- you have to multiply +# by something like 5 or 10 to get real bytes (not sure why so large) +num_threads=1 + +# affects whether we invoke lattice-determinize-non-compact after decoding +# discriminative-get-supervision. +determinize_before_split=true + + +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 5 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/nnet3/tdnn_a exp/nnet3/tdnn_a_ali exp/nnet3/tdnn_a_degs" + echo "" + echo "For options, see top of script file. Standard options:" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs (probably would be good to add -tc 5 or so if using" + echo " # GridEngine (to avoid excessive NFS traffic)." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + echo " --online-ivector-dir # Directory for online-estimated iVectors, used in the" + echo " # online-neural-net setup." + echo " --nj # number of jobs to submit to the queue." + echo " --num-threads # number of threads per decoding job" + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +alidir=$4 +dir=$5 + + +extra_files= +[ ! 
-z $online_ivector_dir ] && \ + extra_files="$extra_files $online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp" +[ "$feat_type" = "lda" ] && \ + extra_files="$extra_files $srcdir/final.mat" +[ ! -z $transform_dir ] && \ + extra_files="$extra_files $transform_dir/trans.1 $transform_dir/num_jobs" + +# Check some files. +for f in $data/feats.scp $lang/L.fst $srcdir/${iter}.mdl $srcdir/tree \ + $srcdir/cmvn_opts $alidir/ali.1.gz $alidir/num_jobs $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +mkdir -p $dir/log $dir/info || exit 1; + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; +utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + + + +utils/split_data.sh --per-utt $data $nj +sdata=$data/split${nj}utt + + +## Set up features. +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + + +cmvn_opts=$(cat $srcdir/cmvn_opts) || exit 1 + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + ;; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +fi + + +## set iVector options +if [ ! -z "$online_ivector_dir" ]; then + online_ivector_period=$(cat $online_ivector_dir/ivector_period) + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$online_ivector_period" +fi + +## set frame-subsampling-factor option and copy file +if [ -f $srcdir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) || exit 1 + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + cp $srcdir/frame_subsampling_factor $dir +else + frame_subsampling_factor=1 +fi + + +## Make the decoding graph. 
+if [ $stage -le 0 ]; then + new_lang="$dir/"$(basename "$lang") + rm -r $new_lang 2>/dev/null + cp -rH $lang $dir + echo "$0: Making unigram grammar FST in $new_lang" + oov=$(cat data/lang/oov.txt) + cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \ + || exit 1; + + utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1; +fi + +# copy alignments into ark,scp format which allows us to use different num-jobs +# from the alignment, and is also convenient for getting priors. +if [ $stage -le 1 ]; then + echo "$0: Copying input alignments" + nj_ali=$(cat $alidir/num_jobs) + alis=$(for n in $(seq $nj_ali); do echo -n "$alidir/ali.$n.gz "; done) + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $alis|" \ + ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; +fi + +[ -f $dir/ali.scp ] || { echo "$0: expected $dir/ali.scp to exist"; exit 1; } + +if [ $stage -le 2 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s:JOB:1:g)" + if feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo $feat_dim > $dir/info/feat_dim + else # run without stderr redirection to show the error. + feat-to-dim "$feats_one" -; exit 1 + fi +fi + +# copy the model to the degs directory. +cp $srcdir/${iter}.mdl $dir/final.mdl || exit 1 + +# Create some info in $dir/info + +# Work out total number of archives. Add one on the assumption the +# num-frames won't divide exactly, and we want to round up. +num_archives=$[num_frames/frames_per_iter+1] + +echo $num_archives >$dir/info/num_archives +echo $frame_subsampling_factor >$dir/info/frame_subsampling_factor + +# the first field in frames_per_eg (which is a comma-separated list of numbers) +# is the 'principal' frames-per-eg, and for purposes of working out the number +# of archives we assume that this will be the average number of frames per eg. +frames_per_eg_principal=$(echo $frames_per_eg | cut -d, -f1) + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
+ echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/degs.$x.ark; done) + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/degs.$x.scp; done) + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig.$y.ark; done) + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig.$y.scp; done) + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig_filtered.$y.scp; done) +fi + + +extra_context_opts="--extra-left-context=$extra_left_context --extra-right-context=$extra_right_context --extra-left-context-initial=$extra_left_context_initial --extra-right-context-final=$extra_right_context_final" + +# work out absolute context opts, --left-context and so on [need model context] +model_left_context=$(nnet3-am-info $srcdir/${iter}.mdl | grep "^left-context:" | awk '{print $2}') +model_right_context=$(nnet3-am-info $srcdir/${iter}.mdl | grep "^right-context:" | awk '{print $2}') +left_context=$[model_left_context+extra_left_context+frame_subsampling_factor/2] +right_context=$[model_right_context+extra_right_context+frame_subsampling_factor/2] +context_opts="--left-context=$left_context --right-context=$right_context" +if [ $extra_left_context_initial -ge 0 ]; then + left_context_initial=$[model_left_context+extra_left_context_initial+frame_subsampling_factor/2] + context_opts="$context_opts --left-context-initial=$left_context_initial" +fi +if [ $extra_right_context_final -ge 0 ]; then + right_context_final=$[model_right_context+extra_right_context_final+frame_subsampling_factor/2] + context_opts="$context_opts --right-context-final=$right_context_final" +fi + +## +if [ $num_threads -eq 1 ]; then + if $looped; then + decoder="nnet3-latgen-faster-looped" + [ $extra_left_context_initial -ge 0 ] && \ + decoder="$decoder --extra-left-context-initial=$extra_left_context_initial" + else + decoder="nnet3-latgen-faster $extra_context_opts" + fi + threads_cmd_opt= +else + $looped && { echo "$0: --num-threads must be one if you use looped decoding"; exit 1; } + threads_cmd_opt="--num-threads $num_threads" + decoder="nnet3-latgen-faster-parallel --num-threads=$num_threads $extra_context_opts" + true +fi + +# set the command to determinize lattices, if specified. +if $determinize_before_split; then + lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --prune --beam=$lattice_beam ark:- ark:-" +else + lattice_determinize_cmd="cat" +fi + +if [ $stage -le 3 ]; then + echo "$0: decoding and dumping egs" + $cmd $threads_cmd_opt JOB=1:$nj $dir/log/decode_and_get_egs.JOB.log \ + $decoder \ + $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk_decoding \ + --determinize-lattice=false \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \ + --word-symbol-table=$lang/words.txt $dir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" ark:- \| \ + $lattice_determinize_cmd \| \ + nnet3-discriminative-get-egs --acoustic-scale=$acwt --compress=$compress \ + --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg \ + $ivector_opts $context_opts \ + $dir/final.mdl "$feats" "ark,s,cs:-" \ + "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \ + ark,scp:$dir/degs_orig.JOB.ark,$dir/degs_orig.JOB.scp || exit 1 +fi + + +if [ $stage -le 4 ]; then + echo "$0: getting validation utterances." 
+ + ## Get list of validation utterances. + awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ + > $dir/valid_uttlist || exit 1; + + if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp + fi + + # the following awk statement turns 'foo123' into something like + # '^foo123-[0-9]\+ ' which is a grep expression that matches the lines in the + # .scp file that correspond to an utterance in valid_uttlist. + cat $dir/valid_uttlist | awk '{printf("^%s-[0-9]\\+ \n", $1);}' \ + >$dir/valid_uttlist.regexps || exit 1 + + # remove the validation utterances from deg_orig.*.scp to produce + # degs_orig_filtered.*.scp. + # note: the '||' true is in case the grep returns nonzero status for + # some splits, because they were all validation utterances. + $cmd JOB=1:$nj $dir/log/filter_and_shuffle.JOB.log \ + grep -v -f $dir/valid_uttlist.regexps $dir/degs_orig.JOB.scp '>' \ + $dir/degs_orig_filtered.JOB.scp '||' true || exit 1 + + # extract just the validation utterances from deg_orig.*.scp to produce + # degs_valid.*.scp. + $cmd JOB=1:$nj $dir/log/extract_validation_egs.JOB.log \ + grep -f $dir/valid_uttlist.regexps $dir/degs_orig.JOB.scp '>' \ + $dir/degs_valid.JOB.scp '||' true || exit 1 + + for j in $(seq $nj); do + cat $dir/degs_valid.$j.scp; rm $dir/degs_valid.$j.scp; + done | utils/shuffle_list.pl | head -n$num_utts_subset >$dir/valid_diagnostic.scp || exit 1 + + [ -s $dir/valid_diagnostic.scp ] || { echo "$0: error getting validation egs"; exit 1; } +fi + + +# read 'mof' as max_open_filehandles. +# When splitting up the scp files, we don't want to have to hold too many +# files open at once. +mof=$(ulimit -n) || exit 1 +# the next step helps work around inconsistency between different machines on a +# cluster. It's unlikely that the allowed number of open filehandles would ever +# be less than 256. +if [ $mof -gt 256 ]; then mof=256; fi +# allocate mof minus 3 for the max allowed outputs, because of +# stdin,stderr,stdout. this will normally come to 253. We'll do a two-stage +# splitting if the needed number of scp files is larger than this. +num_groups=$[(num_archives+(mof-3)-1)/(mof-3)] +group_size=$[(num_archives+num_groups-1)/num_groups] +if [ $num_groups -gt 1 ]; then + new_num_archives=$[group_size*num_groups] + [ $new_num_archives -ne $num_archives ] && \ + echo "$0: rounding up num-archives from $num_archives to $new_num_archives for easier splitting" + echo $new_num_archives >$dir/info/num_archives +fi + + +# function/pseudo-command to randomly shuffle input lines using a small buffer size +function shuffle { + perl -e ' use List::Util qw(shuffle); srand(0); + $bufsz=1000; @A = (); while() { push @A, $_; if (@A == $bufsz) { + $n=int(rand()*$bufsz); print $A[$n]; $A[$n] = $A[$bufsz-1]; pop @A; }} + @A = shuffle(@A); print @A; ' + } +# funtion/pseudo-command to put input lines round robin to command line args. 
+function round_robin { + perl -e '@F=(); foreach $a (@ARGV) { my $f; open($f, ">$a") || die "opening file $a"; push @F, $f; } + $N=@F; $N>0||die "No output files"; $n=0; + while () { $fh=$F[$n%$N]; $n++; print $fh $_ || die "error printing"; } ' $* +} + + +if [ $stage -le 5 ]; then + echo "$0: rearranging scp files" + + if [ $num_groups -eq 1 ]; then + # output directly to the archive files. + outputs=$(for n in $(seq $num_archives); do echo $dir/degs.$n.scp; done) + else + # output to intermediate 'group' files. + outputs=$(for g in $(seq $num_groups); do echo $dir/degs_group.$g.scp; done) + fi + + # We can't use UNIX's split command because of compatibility issues (BSD + # version very different from GNU version), so we use 'round_robin' which is + # a bash function that calls an inline perl script. + for j in $(seq $nj); do cat $dir/degs_orig_filtered.$j.scp; done | \ + shuffle | round_robin $outputs || exit 1 + + if [ $num_groups -gt 1 ]; then + for g in $(seq $num_groups); do + first=$[1+group_size*(g-1)] + last=$[group_size*g] + outputs=$(for n in $(seq $first $last); do echo $dir/degs.$n.scp; done) + cat $dir/degs_group.$g.scp | shuffle | round_robin $outputs + done + fi +fi + +if [ $stage -le 6 ]; then + echo "$0: getting train-subset scp" + # get degs_train_subset.scp by taking the top and tail of the degs files [quicker + # than cat'ing all the files, random shuffling and head] + + nl=$[$num_egs_subset/$num_archives + 1] + + # use utils/shuffle_list.pl because it provides a complete shuffle (ok since + # the amount of data is small). note: shuf is not available on mac by + # default. + for n in $(seq $num_archives); do + head -n$nl $dir/degs.$n.scp; tail -n$nl $dir/degs.$n.scp + done | utils/shuffle_list.pl | head -n$num_utts_subset >$dir/train_diagnostic.scp + [ -s $dir/train_diagnostic.scp ] || { echo "$0: error getting train_diagnostic.scp"; exit 1; } +fi + +if [ $stage -le 7 ]; then + echo "$0: creating final archives" + $cmd --max-jobs-run "$max_copy_jobs" \ + JOB=1:$num_archives $dir/log/copy_archives.JOB.log \ + nnet3-discriminative-copy-egs scp:$dir/degs.JOB.scp ark:$dir/degs.JOB.ark || exit 1 + + run.pl $dir/log/copy_train_subset.log \ + nnet3-discriminative-copy-egs scp:$dir/train_diagnostic.scp \ + ark:$dir/train_diagnostic.ark || exit 1 + + run.pl $dir/log/copy_valid_subset.log \ + nnet3-discriminative-copy-egs scp:$dir/valid_diagnostic.scp \ + ark:$dir/valid_diagnostic.ark || exit 1 +fi + +if [ $stage -le 10 ] && $cleanup; then + echo "$0: cleaning up temporary files." + for j in $(seq $nj); do + for f in $dir/degs_orig.$j.{ark,scp} $dir/degs_orig_filtered.$j.scp; do + [ -L $f ] && rm $(readlink -f $f); rm $f + done + done + rm $dir/degs_group.*.scp $dir/valid_diagnostic.scp $dir/train_diagnostic.scp 2>/dev/null + rm $dir/ali.ark $dir/ali.scp 2>/dev/null + for n in $(seq $num_archives); do + for f in $dir/degs.$n.scp; do + [ -L $f ] && rm $(readlink -f $f); rm $f + done + done +fi + + +exit 0 + + +echo "$0: Finished decoding and preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 27877680982..cb7ea0ac73c 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
# # This script, which will generally be called from other neural-net training # scripts, extracts the training examples used to train the neural net (and also diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh index fd616160632..377c49fc5cb 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -1,8 +1,11 @@ #!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. # Copyright 2014-2015 Vimal Manohar +# Note: you may find it more convenient to use the newer script get_degs.sh, which +# combines decoding and example-creation in one step without writing lattices. + # This script dumps examples MPE or MMI or state-level minimum bayes risk (sMBR) # training of neural nets. # Criterion supported are mpe, smbr and mmi @@ -12,6 +15,8 @@ cmd=run.pl feat_type=raw # set it to 'lda' to use LDA features. frames_per_eg=150 # number of frames of labels per example. more->less disk space and # less time preparing egs, but more I/O during training. + # Note: may in general be a comma-separated string of alternative + # durations; the first one (the principal num-frames) is preferred. frames_overlap_per_eg=30 # number of supervised frames of overlap that we aim for per eg. # can be useful to avoid wasted data if you're using --left-deriv-truncate # and --right-deriv-truncate. @@ -32,11 +37,6 @@ frames_per_iter=400000 # each iteration of training, see this many frames # per job. This is just a guideline; it will pick a number # that divides the number of samples in the entire data. -determinize=true -minimize=true -remove_output_symbols=true -remove_epsilons=true -collapse_transition_ids=true acwt=0.1 stage=0 @@ -225,7 +225,7 @@ if [ $stage -le 2 ]; then fi fi -# Working out total number of archives. Add one on the assumption the +# Work out total number of archives. Add one on the assumption the # num-frames won't divide exactly, and we want to round up. num_archives=$[$num_frames/$frames_per_iter+1] @@ -244,8 +244,14 @@ num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; echo $num_archives >$dir/info/num_archives echo $frames_per_eg >$dir/info/frames_per_eg + +# the first field in frames_per_eg (which is a comma-separated list of numbers) +# is the 'principal' frames-per-eg, and for purposes of working out the number +# of archives we assume that this will be the average number of frames per eg. +frames_per_eg_principal=$(echo $frames_per_eg | cut -d, -f1) + # Work out the number of egs per archive -egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +egs_per_archive=$[$num_frames/($frames_per_eg_principal*$num_archives)] || exit 1; ! 
[ $egs_per_archive -le $frames_per_iter ] && \ echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ && exit 1; @@ -279,7 +285,6 @@ if [ $stage -le 3 ]; then for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp fi -splitter_opts="--supervision-splitter.determinize=$determinize --supervision-splitter.minimize=$minimize --supervision-splitter.remove_output_symbols=$remove_output_symbols --supervision-splitter.remove_epsilons=$remove_epsilons --supervision-splitter.collapse-transition-ids=$collapse_transition_ids --supervision-splitter.acoustic-scale=$acwt" # If frame_subsampling_factor > 0, we will later be shifting the egs slightly to @@ -291,7 +296,7 @@ right_context=$[right_context+frame_subsampling_factor/2] [ $left_context_initial -ge 0 ] && left_context_initial=$[left_context_initial+frame_subsampling_factor/2] [ $right_context_final -ge 0 ] && right_context_final=$[right_context_final+frame_subsampling_factor/2] -egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --compress=$compress --frame-subsampling-factor=$frame_subsampling_factor $splitter_opts" +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --compress=$compress --frame-subsampling-factor=$frame_subsampling_factor --acoustic-scale=$acwt" [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" @@ -305,8 +310,6 @@ priors_egs_opts="--left-context=$left_context --right-context=$right_context --n [ $right_context_final -ge 0 ] && priors_egs_opts="$priors_egs_opts --right-context-final=$right_context_final" -supervision_all_opts="--frame-subsampling-factor=$frame_subsampling_factor" - echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial @@ -368,16 +371,14 @@ if [ $stage -le 4 ]; then <$dir/ali.scp >$dir/ali_special.scp $cmd $dir/log/create_valid_subset.log \ - discriminative-get-supervision $supervision_all_opts \ - scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ nnet3-discriminative-get-egs $ivector_opts $egs_opts \ - $dir/final.mdl "$valid_feats" ark,s,cs:- "ark:$dir/valid_diagnostic.degs" || touch $dir/.error & + $dir/final.mdl "$valid_feats" scp:$dir/lat_special.scp \ + scp:$dir/ali_special.scp "ark:$dir/valid_diagnostic.degs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ - discriminative-get-supervision $supervision_all_opts \ - scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ nnet3-discriminative-get-egs $ivector_opts $egs_opts \ - $dir/final.mdl "$train_subset_feats" ark,s,cs:- "ark:$dir/train_diagnostic.degs" || touch $dir/.error & + $dir/final.mdl "$train_subset_feats" scp:$dir/lat_special.scp \ + scp:$dir/ali_special.scp "ark:$dir/train_diagnostic.degs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." @@ -403,12 +404,10 @@ if [ $stage -le 5 ]; then # files is the product of 'nj' by 'num_archives_intermediate', which might be # quite large. 
$cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ - discriminative-get-supervision $supervision_all_opts \ - "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \ - "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz |" ark:- \| \ nnet3-discriminative-get-egs $ivector_opts $egs_opts \ - --num-frames-overlap=$frames_overlap_per_eg \ - $dir/final.mdl "$feats" ark,s,cs:- ark:- \| \ + --num-frames-overlap=$frames_overlap_per_eg \ + $dir/final.mdl "$feats" "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz |" \ + "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" ark:- \| \ nnet3-discriminative-copy-egs --random=true --srand=JOB ark:- $degs_list || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/make_denlats.sh b/egs/wsj/s5/steps/nnet3/make_denlats.sh index 7bc8dbd8c08..d1591c0b1de 100755 --- a/egs/wsj/s5/steps/nnet3/make_denlats.sh +++ b/egs/wsj/s5/steps/nnet3/make_denlats.sh @@ -22,7 +22,7 @@ transform_dir= max_mem=20000000 # This will stop the processes getting too large. # This is in bytes, but not "real" bytes-- you have to multiply # by something like 5 or 10 to get real bytes (not sure why so large) -num_threads=1 # Fixed to 1 for now +num_threads=1 # number of threads of decoder [only applicable if not looped, for now] online_ivector_dir= determinize=true minimize=false @@ -174,7 +174,7 @@ fi lattice_determinize_cmd= if $determinize; then - lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=$minimize --prune --beam=$beam ark:- ark:- |" + lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=$minimize --prune --beam=$lattice_beam ark:- ark:- |" fi if [ $sub_split -eq 1 ]; then @@ -248,4 +248,3 @@ fi echo "$0: done generating denominator lattices." - diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh index b0bf2a2aad6..fb75e7b0aab 100755 --- a/egs/wsj/s5/steps/nnet3/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -17,8 +17,6 @@ num_epochs=4 # Number of epochs of training; # num-epochs * frame-subsampling-factor times, due to # using different data-shifts. use_gpu=true -truncate_deriv_weights=0 # can be used to set to zero the weights of derivs from frames - # near the edges. (counts subsampled frames). apply_deriv_weights=true use_frame_shift=false run_diagnostics=true @@ -50,7 +48,8 @@ shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of stage=-3 -adjust_priors=true +adjust_priors=true # If true then it will + num_threads=16 # this is the default but you may want to change it, e.g. to 1 if # using GPUs. @@ -59,8 +58,9 @@ keep_model_iters=1 remove_egs=false src_model= # will default to $degs_dir/final.mdl -left_deriv_truncate= # number of time-steps to avoid using the deriv of, on the left. -right_deriv_truncate= # number of time-steps to avoid using the deriv of, on the right. + +min_deriv_time=0 +max_deriv_time_relative=0 # End configuration section. @@ -71,7 +71,7 @@ if [ -f path.sh ]; then . ./path.sh; fi if [ $# != 2 ]; then - echo "Usage: $0 [opts] " + echo "Usage: $0 [opts] " echo " e.g.: $0 exp/nnet3/tdnn_sp_degs exp/nnet3/tdnn_sp_smbr" echo "" echo "Main options (for others, see top of script file)" @@ -109,12 +109,18 @@ dir=$2 [ -z "$src_model" ] && src_model=$degs_dir/final.mdl # Check some files. 
-for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl,frames_per_eg,egs_per_archive} $src_model; do +for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl,frame_subsampling_factor} $src_model; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done mkdir -p $dir/log || exit 1; + +model_left_context=$(nnet3-am-info $src_model | grep "^left-context:" | awk '{print $2}') +model_right_context=$(nnet3-am-info $src_model | grep "^right-context:" | awk '{print $2}') + + + # copy some things for f in splice_opts cmvn_opts tree final.mat; do if [ -f $degs_dir/$f ]; then @@ -129,7 +135,6 @@ if $adjust_priors; then num_archives_priors=`cat $degs_dir/info/num_archives_priors` || exit 1 fi -frames_per_eg=$(cat $degs_dir/info/frames_per_eg) || { echo "error: no such file $degs_dir/info/frames_per_eg"; exit 1; } num_archives=$(cat $degs_dir/info/num_archives) || exit 1; frame_subsampling_factor=$(cat $degs_dir/info/frame_subsampling_factor) @@ -201,12 +206,7 @@ fi rm $dir/.error 2>/dev/null -x=0 - -deriv_time_opts= -[ ! -z "$left_deriv_truncate" ] && deriv_time_opts="--optimization.min-deriv-time=$left_deriv_truncate" -[ ! -z "$right_deriv_truncate" ] && \ - deriv_time_opts="$deriv_time_opts --optimization.max-deriv-time=$((frames_per_eg - right_deriv_truncate))" +x=0 while [ $x -lt $num_iters ]; do if [ $stage -le $x ]; then @@ -229,7 +229,7 @@ while [ $x -lt $num_iters ]; do $dir/$x.mdl \ ark:$degs_dir/train_diagnostic.degs & fi - + if [ $x -gt 0 ]; then $cmd $dir/log/progress.$x.log \ nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ @@ -239,9 +239,9 @@ while [ $x -lt $num_iters ]; do echo "Training neural net (pass $x)" - + cache_read_opt="--read-cache=$dir/cache.$x" - + ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. @@ -253,7 +253,7 @@ while [ $x -lt $num_iters ]; do k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive # the other indexes from. archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. - + if [ $n -eq 1 ]; then # an option for writing cache (storing pairs of nnet-computations and # computation-requests) during training. 
@@ -282,14 +282,16 @@ while [ $x -lt $num_iters ]; do $cmd $train_queue_opt $dir/log/train.$x.$n.log \ nnet3-discriminative-train $cache_read_opt $cache_write_opt \ --apply-deriv-weights=$apply_deriv_weights \ - $parallel_train_opts $deriv_time_opts \ + --optimization.min-deriv-time=-$model_left_context \ + --optimization.max-deriv-time-relative=$model_right_context \ + $parallel_train_opts \ --max-param-change=$this_max_param_change \ --silence-phones=$silphonelist \ --criterion=$criterion --drop-frames=$drop_frames \ --one-silence-class=$one_silence_class \ --boost=$boost --acoustic-scale=$acoustic_scale $regularization_opts \ $dir/$x.mdl \ - "ark:nnet3-discriminative-copy-egs --frame-shift=$frame_shift --truncate-deriv-weights=$truncate_deriv_weights ark:$degs_dir/degs.$archive.ark ark:- | nnet3-discriminative-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" \ + "ark:nnet3-discriminative-copy-egs --frame-shift=$frame_shift ark:$degs_dir/degs.$archive.ark ark:- | nnet3-discriminative-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait @@ -378,4 +380,3 @@ if $cleanup; then fi done fi - diff --git a/egs/wsj/s5/utils/filter_scps.pl b/egs/wsj/s5/utils/filter_scps.pl index 0d9e0fe4837..07e59d6ba80 100755 --- a/egs/wsj/s5/utils/filter_scps.pl +++ b/egs/wsj/s5/utils/filter_scps.pl @@ -165,6 +165,5 @@ print STDERR "filter_scps.pl: warning: some input lines did not get output\n"; } if ($warn_multiply_covered && $print_warnings) { - print STDERR "filter_scps.pl: warning: some input lines were output to multiple files\n"; + print STDERR "filter_scps.pl: warning: some input lines were output to multiple files [OK if splitting per utt]\n"; } - diff --git a/egs/wsj/s5/utils/split_data.sh b/egs/wsj/s5/utils/split_data.sh index e44a4ab6359..ab0dbbf35c7 100755 --- a/egs/wsj/s5/utils/split_data.sh +++ b/egs/wsj/s5/utils/split_data.sh @@ -41,6 +41,14 @@ if ! [ "$numsplit" -gt 0 ]; then exit 1; fi +if $split_per_spk; then + warning_opt= +else + # suppress warnings from filter_scps.pl about 'some input lines were output + # to multiple files'. + warning_opt="--no-warn" +fi + n=0; feats="" wavs="" @@ -124,9 +132,6 @@ done # split some things that are indexed by speaker for f in spk2gender spk2warp cmvn.scp; do if [ -f $data/$f ]; then - ! $split_per_spk && warning_opt="--no-warn" - # suppress warnings from filter_scps.pl about 'some input lines were output - # to multiple files', which is expected in this case. utils/filter_scps.pl $warning_opt JOB=1:$numsplit \ $data/split${numsplit}${utt}/JOB/spk2utt $data/$f $data/split${numsplit}${utt}/JOB/$f || exit 1; fi @@ -140,12 +145,12 @@ if [ -f $data/segments ]; then awk '{print $2;}' $dsn/segments | sort | uniq > $dsn/tmp.reco # recording-ids. 
done if [ -f $data/reco2file_and_channel ]; then - utils/filter_scps.pl JOB=1:$numsplit \ + utils/filter_scps.pl $warning_opt JOB=1:$numsplit \ $data/split${numsplit}${utt}/JOB/tmp.reco $data/reco2file_and_channel \ $data/split${numsplit}${utt}/JOB/reco2file_and_channel || exit 1 fi if [ -f $data/wav.scp ]; then - utils/filter_scps.pl JOB=1:$numsplit \ + utils/filter_scps.pl $warning_opt JOB=1:$numsplit \ $data/split${numsplit}${utt}/JOB/tmp.reco $data/wav.scp \ $data/split${numsplit}${utt}/JOB/wav.scp || exit 1 fi diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 1396932252a..fddaa6c9952 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -265,7 +265,6 @@ int main(int argc, char *argv[]) { bool random = false; int32 srand_seed = 0; int32 frame_shift = 0; - int32 truncate_deriv_weights = 0; int32 frame_subsampling_factor = -1; BaseFloat keep_proportion = 1.0; int32 left_context = -1, right_context = -1; @@ -282,9 +281,6 @@ int main(int argc, char *argv[]) { "in the supervision data (excluding iVector data) - useful in " "augmenting data. Note, the outputs will remain at the closest " "exact multiples of the frame subsampling factor"); - po.Register("truncate-deriv-weights", &truncate_deriv_weights, - "If nonzero, the number of initial/final subsample frames that " - "will have their derivatives' weights set to zero."); po.Register("left-context", &left_context, "Can be used to truncate the " "feature left-context that we output."); po.Register("right-context", &right_context, "Can be used to truncate the " @@ -320,7 +316,7 @@ int main(int argc, char *argv[]) { // count is normally 1; could be 0, or possibly >1. int32 count = GetCount(keep_proportion); std::string key = example_reader.Key(); - if (frame_shift == 0 && truncate_deriv_weights == 0 && + if (frame_shift == 0 && left_context == -1 && right_context == -1) { const NnetChainExample &eg = example_reader.Value(); for (int32 c = 0; c < count; c++) { @@ -338,8 +334,6 @@ int main(int argc, char *argv[]) { frame_subsampling_factor, &eg_out); else eg_out.Swap(&eg); - if (truncate_deriv_weights != 0) - TruncateDerivWeights(truncate_deriv_weights, &eg_out); for (int32 c = 0; c < count; c++) { int32 index = (random ? 
Rand() : num_written) % num_outputs; example_writers[index]->Write(key, eg_out); diff --git a/src/nnet3/discriminative-supervision.cc b/src/nnet3/discriminative-supervision.cc index 223257e5a5f..94a165f4c50 100644 --- a/src/nnet3/discriminative-supervision.cc +++ b/src/nnet3/discriminative-supervision.cc @@ -24,14 +24,11 @@ namespace kaldi { namespace discriminative { -void DiscriminativeSupervisionOptions::Check() const { - KALDI_ASSERT(frame_subsampling_factor > 0); -} DiscriminativeSupervision::DiscriminativeSupervision( const DiscriminativeSupervision &other): weight(other.weight), num_sequences(other.num_sequences), - frames_per_sequence(other.frames_per_sequence), + frames_per_sequence(other.frames_per_sequence), num_ali(other.num_ali), den_lat(other.den_lat) { } void DiscriminativeSupervision::Swap(DiscriminativeSupervision *other) { @@ -44,7 +41,7 @@ void DiscriminativeSupervision::Swap(DiscriminativeSupervision *other) { bool DiscriminativeSupervision::operator == ( const DiscriminativeSupervision &other) const { - return ( weight == other.weight && + return ( weight == other.weight && num_sequences == other.num_sequences && frames_per_sequence == other.frames_per_sequence && num_ali == other.num_ali && @@ -61,14 +58,14 @@ void DiscriminativeSupervision::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, frames_per_sequence); KALDI_ASSERT(frames_per_sequence > 0 && num_sequences > 0); - + WriteToken(os, binary, ""); WriteIntegerVector(os, binary, num_ali); WriteToken(os, binary, ""); if (!WriteLattice(os, binary, den_lat)) { // We can't return error status from this function so we - // throw an exception. + // throw an exception. KALDI_ERR << "Error writing denominator lattice to stream"; } @@ -83,9 +80,9 @@ void DiscriminativeSupervision::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &num_sequences); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &frames_per_sequence); - KALDI_ASSERT(frames_per_sequence > 0 && + KALDI_ASSERT(frames_per_sequence > 0 && num_sequences > 0); - + ExpectToken(is, binary, ""); ReadIntegerVector(is, binary, &num_ali); @@ -94,7 +91,7 @@ void DiscriminativeSupervision::Read(std::istream &is, bool binary) { Lattice *lat = NULL; if (!ReadLattice(is, binary, &lat) || lat == NULL) { // We can't return error status from this function so we - // throw an exception. + // throw an exception. KALDI_ERR << "Error reading Lattice from stream"; } den_lat = *lat; @@ -106,7 +103,7 @@ void DiscriminativeSupervision::Read(std::istream &is, bool binary) { } bool DiscriminativeSupervision::Initialize(const std::vector &num_ali, - const Lattice &den_lat, + const Lattice &den_lat, BaseFloat weight) { if (num_ali.size() == 0) return false; if (den_lat.NumStates() == 0) return false; @@ -126,7 +123,7 @@ bool DiscriminativeSupervision::Initialize(const std::vector &num_ali, void DiscriminativeSupervision::Check() const { int32 num_frames_subsampled = num_ali.size(); - KALDI_ASSERT(num_frames_subsampled == + KALDI_ASSERT(num_frames_subsampled == num_sequences * frames_per_sequence); { @@ -150,14 +147,14 @@ DiscriminativeSupervisionSplitter::DiscriminativeSupervisionSplitter( den_lat_ = supervision_.den_lat; PrepareLattice(&den_lat_, &den_lat_scores_); - + int32 num_states = den_lat_.NumStates(), num_frames = supervision_.frames_per_sequence * supervision_.num_sequences; KALDI_ASSERT(num_states > 0); int32 start_state = den_lat_.Start(); // Lattice should be top-sorted and connected, so start-state must be 0. 
KALDI_ASSERT(start_state == 0 && "Expecting start-state to be 0"); - + KALDI_ASSERT(num_states == den_lat_scores_.state_times.size()); KALDI_ASSERT(den_lat_scores_.state_times[start_state] == 0); KALDI_ASSERT(den_lat_scores_.state_times.back() == num_frames); @@ -193,7 +190,7 @@ void DiscriminativeSupervisionSplitter::CollapseTransitionIds( pdf_to_tid[t][pdf] = arc.ilabel; } } - } + } } void DiscriminativeSupervisionSplitter::LatticeInfo::Check() const { @@ -204,9 +201,9 @@ void DiscriminativeSupervisionSplitter::LatticeInfo::Check() const { // Check that the states are ordered in increasing order of state_times. // This must be true since the states are in breadth-first search order. KALDI_ASSERT(IsSorted(state_times)); -} +} -void DiscriminativeSupervisionSplitter::GetFrameRange(int32 begin_frame, int32 num_frames, bool normalize, +void DiscriminativeSupervisionSplitter::GetFrameRange(int32 begin_frame, int32 num_frames, bool normalize, DiscriminativeSupervision *out_supervision) const { int32 end_frame = begin_frame + num_frames; // Note: end_frame is not included in the range of frames that the @@ -224,7 +221,7 @@ void DiscriminativeSupervisionSplitter::GetFrameRange(int32 begin_frame, int32 n std::copy(supervision_.num_ali.begin() + begin_frame, supervision_.num_ali.begin() + end_frame, std::back_inserter(out_supervision->num_ali)); - + out_supervision->num_sequences = 1; out_supervision->weight = supervision_.weight; out_supervision->frames_per_sequence = num_frames; @@ -239,19 +236,19 @@ void DiscriminativeSupervisionSplitter::CreateRangeLattice( typedef Lattice::StateId StateId; const std::vector &state_times = scores.state_times; - - // Some checks to ensure the lattice and scores are prepared properly + + // Some checks to ensure the lattice and scores are prepared properly KALDI_ASSERT(state_times.size() == in_lat.NumStates()); if (!in_lat.Properties(fst::kTopSorted, true)) KALDI_ERR << "Input lattice must be topologically sorted."; std::vector::const_iterator begin_iter = std::lower_bound(state_times.begin(), state_times.end(), begin_frame), - end_iter = std::lower_bound(begin_iter, + end_iter = std::lower_bound(begin_iter, state_times.end(), end_frame); KALDI_ASSERT(*begin_iter == begin_frame && - (begin_iter == state_times.begin() || + (begin_iter == state_times.begin() || begin_iter[-1] < begin_frame)); // even if end_frame == supervision_.num_frames, there should be a state with // that frame index. @@ -267,10 +264,10 @@ void DiscriminativeSupervisionSplitter::CreateRangeLattice( // Add special start state StateId start_state = out_lat->AddState(); out_lat->SetStart(start_state); - + for (StateId i = begin_state; i < end_state; i++) out_lat->AddState(); - + // Add the special final-state. StateId final_state = out_lat->AddState(); out_lat->SetFinal(final_state, LatticeWeight::One()); @@ -280,10 +277,10 @@ void DiscriminativeSupervisionSplitter::CreateRangeLattice( if (state_times[state] == begin_frame) { // we'd like to make this an initial state, but OpenFst doesn't allow // multiple initial states. Instead we add an epsilon transition to it - // from our actual initial state. The weight on this + // from our actual initial state. The weight on this // transition is the forward probability of the said 'initial state' LatticeWeight weight = LatticeWeight::One(); - weight.SetValue1((normalize ? scores.beta[0] : 0.0) - scores.alpha[state]); + weight.SetValue1((normalize ? 
scores.beta[0] : 0.0) - scores.alpha[state]); // Add negative of the forward log-probability to the graph cost score, // since the acoustic scores would be changed later. // Assuming that the lattice is scaled with appropriate acoustic @@ -294,29 +291,29 @@ void DiscriminativeSupervisionSplitter::CreateRangeLattice( // Note: Doing a forward-backward on this split must result in a total // score of 0 because of the normalization. - out_lat->AddArc(start_state, + out_lat->AddArc(start_state, LatticeArc(0, 0, weight, output_state)); } else { KALDI_ASSERT(scores.state_times[state] < end_frame); } - for (fst::ArcIterator aiter(in_lat, state); + for (fst::ArcIterator aiter(in_lat, state); !aiter.Done(); aiter.Next()) { const LatticeArc &arc = aiter.Value(); StateId nextstate = arc.nextstate; if (nextstate >= end_state) { // A transition to any state outside the range becomes a transition to - // our special final-state. - // The weight is just the negative of the backward log-probability + + // our special final-state. + // The weight is just the negative of the backward log-probability + // the arc cost. We again normalize with the total lattice score. LatticeWeight weight; //KALDI_ASSERT(scores.beta[state] < 0); - weight.SetValue1(arc.weight.Value1() - scores.beta[nextstate]); + weight.SetValue1(arc.weight.Value1() - scores.beta[nextstate]); weight.SetValue2(arc.weight.Value2()); // Add negative of the backward log-probability to the LM score, since // the acoustic scores would be changed later. // Note: We don't normalize here because that is already done with the // initial cost. - + out_lat->AddArc(output_state, LatticeArc(arc.ilabel, arc.olabel, weight, final_state)); } else { @@ -350,28 +347,28 @@ void DiscriminativeSupervisionSplitter::CreateRangeLattice( } } - fst::TopSort(out_lat); + fst::TopSort(out_lat); std::vector state_times_tmp; KALDI_ASSERT(LatticeStateTimes(*out_lat, &state_times_tmp) == end_frame - begin_frame); // Remove the acoustic scale that was previously added - if (config_.supervision_config.acoustic_scale != 1.0) { + if (config_.acoustic_scale != 1.0) { fst::ScaleLattice(fst::AcousticLatticeScale( - 1 / config_.supervision_config.acoustic_scale), out_lat); + 1 / config_.acoustic_scale), out_lat); } } void DiscriminativeSupervisionSplitter::PrepareLattice( Lattice *lat, LatticeInfo *scores) const { - // Scale the lattice to appropriate acoustic scale. It is important to - // ensure this is equal to the acoustic scale used while training. This is - // because, on splitting lattices, the initial and final costs are added + // Scale the lattice to appropriate acoustic scale. It is important to + // ensure this is equal to the acoustic scale used while training. This is + // because, on splitting lattices, the initial and final costs are added // into the graph cost. - KALDI_ASSERT(config_.supervision_config.acoustic_scale != 0.0); - if (config_.supervision_config.acoustic_scale != 1.0) + KALDI_ASSERT(config_.acoustic_scale != 0.0); + if (config_.acoustic_scale != 1.0) fst::ScaleLattice(fst::AcousticLatticeScale( - config_.supervision_config.acoustic_scale), lat); + config_.acoustic_scale), lat); LatticeStateTimes(*lat, &(scores->state_times)); int32 num_states = lat->NumStates(); @@ -383,7 +380,7 @@ void DiscriminativeSupervisionSplitter::PrepareLattice( // Order the states based on the state times. This is stronger than just // topological sort. This is required by the lattice splitting code. 
std::sort(state_time_indexes.begin(), state_time_indexes.end()); - + std::vector state_order(num_states); for (int32 s = 0; s < num_states; s++) { state_order[state_time_indexes[s].second] = s; @@ -396,9 +393,9 @@ void DiscriminativeSupervisionSplitter::PrepareLattice( void DiscriminativeSupervisionSplitter::ComputeLatticeScores(const Lattice &lat, LatticeInfo *scores) const { LatticeStateTimes(lat, &(scores->state_times)); - ComputeLatticeAlphasAndBetas(lat, false, + ComputeLatticeAlphasAndBetas(lat, false, &(scores->alpha), &(scores->beta)); - scores->Check(); + scores->Check(); // This check will fail if the lattice is not breadth-first search sorted } @@ -427,7 +424,7 @@ void AppendSupervision(const std::vector &inpu fst::Concat(&output_supervision->back().den_lat, src.den_lat); output_supervision->back().num_ali.insert( - output_supervision->back().num_ali.end(), + output_supervision->back().num_ali.end(), src.num_ali.begin(), src.num_ali.end()); output_supervision->back().num_sequences++; @@ -448,5 +445,5 @@ void AppendSupervision(const std::vector &inpu } } -} // namespace discriminative +} // namespace discriminative } // namespace kaldi diff --git a/src/nnet3/discriminative-supervision.h b/src/nnet3/discriminative-supervision.h index c5cdc7a4107..d4c7ee3756e 100644 --- a/src/nnet3/discriminative-supervision.h +++ b/src/nnet3/discriminative-supervision.h @@ -29,37 +29,21 @@ namespace kaldi { namespace discriminative { -struct DiscriminativeSupervisionOptions { - int32 frame_subsampling_factor; - BaseFloat acoustic_scale; - - DiscriminativeSupervisionOptions(): frame_subsampling_factor(1), acoustic_scale(0.1) { } - - void Register(OptionsItf *opts) { - opts->Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " - "if the frame-rate for the model will be less than the " - "frame-rate of the original alignment. Applied after " - "left-tolerance and right-tolerance are applied (so they are " - "in terms of the original num-frames."); - opts->Register("acoustic-scale", &acoustic_scale, - "Scaling factor for acoustic likelihoods"); - } - - void Check() const; -}; struct SplitDiscriminativeSupervisionOptions { + int32 frame_subsampling_factor; bool remove_output_symbols; bool collapse_transition_ids; bool remove_epsilons; bool determinize; bool minimize; // we'll push and minimize if this is true. 
- DiscriminativeSupervisionOptions supervision_config; - + BaseFloat acoustic_scale; + SplitDiscriminativeSupervisionOptions() : - remove_output_symbols(false), collapse_transition_ids(false), - remove_epsilons(false), determinize(false), - minimize(false) { } + frame_subsampling_factor(1), + remove_output_symbols(true), collapse_transition_ids(true), + remove_epsilons(true), determinize(true), + minimize(true), acoustic_scale(0.1) { } void Register(OptionsItf *opts) { opts->Register("collapse-transition-ids", &collapse_transition_ids, @@ -76,7 +60,12 @@ struct SplitDiscriminativeSupervisionOptions { "lattices (as Lattice) after splitting and possibly minimize"); opts->Register("minimize", &minimize, "If true, we push and " "minimize lattices (as Lattice) after splitting"); - supervision_config.Register(opts); + opts->Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods (should match the " + "value used in discriminative-get-supervision)"); + opts->Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " + "if the frame-rate for the model will be less than the " + "frame-rate of the original alignment."); } }; @@ -86,13 +75,13 @@ struct SplitDiscriminativeSupervisionOptions { */ // struct DiscriminativeSupervision is the fully-processed information for -// a whole utterance or (after splitting) part of an utterance. +// a whole utterance or (after splitting) part of an utterance. struct DiscriminativeSupervision { // The weight we assign to this example; // this will typically be one, but we include it - // for the sake of generality. - BaseFloat weight; - + // for the sake of generality. + BaseFloat weight; + // num_sequences will be 1 if you create a DiscriminativeSupervision object from a single // lattice or alignment, but if you combine multiple DiscriminativeSupervision objects // the 'num_sequences' is the number of objects that were combined (the @@ -104,20 +93,20 @@ struct DiscriminativeSupervision { // Technically this information is redundant with the lattices, but it's convenient // to have it separately. int32 frames_per_sequence; - + // The numerator alignment // Usually obtained by aligning the reference text with the seed neural // network model; can be the best path of generated lattice in the case of // semi-supervised training. std::vector num_ali; - + // Note: any acoustic // likelihoods in the lattices will be // recomputed at the time we train. - - // The denominator lattice. - Lattice den_lat; - + + // The denominator lattice. + Lattice den_lat; + DiscriminativeSupervision(): weight(1.0), num_sequences(1), frames_per_sequence(-1) { } @@ -128,7 +117,7 @@ struct DiscriminativeSupervision { // and denominator lattice. The supervision object is used for sequence // discriminative training. // Topologically sorts the lattice after copying to the supervision object. - // Returns false when alignment or lattice is empty + // Returns false when alignment or lattice is empty bool Initialize(const std::vector &alignment, const Lattice &lat, BaseFloat weight); @@ -136,13 +125,13 @@ struct DiscriminativeSupervision { void Swap(DiscriminativeSupervision *other); bool operator == (const DiscriminativeSupervision &other) const; - + // This function checks that this supervision object satifsies some // of the properties we expect of it, and calls KALDI_ERR if not. 
void Check() const; - - inline int32 NumFrames() const { - return num_sequences * frames_per_sequence; + + inline int32 NumFrames() const { + return num_sequences * frames_per_sequence; } void Write(std::ostream &os, bool binary) const; @@ -156,30 +145,30 @@ class DiscriminativeSupervisionSplitter { public: typedef fst::ArcTpl LatticeArc; typedef fst::VectorFst Lattice; - + DiscriminativeSupervisionSplitter( const SplitDiscriminativeSupervisionOptions &config, const TransitionModel &tmodel, const DiscriminativeSupervision &supervision); - // A structure used to store the forward and backward scores + // A structure used to store the forward and backward scores // and state times of a lattice struct LatticeInfo { - // These values are stored in log. + // These values are stored in log. std::vector alpha; std::vector beta; std::vector state_times; void Check() const; }; - - // Extracts a frame range of the supervision into 'supervision'. + + // Extracts a frame range of the supervision into 'supervision'. void GetFrameRange(int32 begin_frame, int32 frames_per_sequence, bool normalize, DiscriminativeSupervision *supervision) const; // Get the acoustic scaled denominator lattice out for debugging purposes - inline const Lattice& DenLat() const { return den_lat_; } + inline const Lattice& DenLat() const { return den_lat_; } private: @@ -187,7 +176,7 @@ class DiscriminativeSupervisionSplitter { // assuming that the corresponding state-range that we need to // include, begin_state <= s < end_state has been included. // (note: the output lattice will also have two special initial and final - // states). + // states). // Also does post-processing (RmEpsilon, Determinize, // TopSort on the result). See code for details. void CreateRangeLattice(const Lattice &in_lat, @@ -201,7 +190,7 @@ class DiscriminativeSupervisionSplitter { // Transition model is used by the function // CollapseTransitionIds() const TransitionModel &tmodel_; - + // A reference to the supervision object that we will be splitting const DiscriminativeSupervision &supervision_; @@ -216,7 +205,7 @@ class DiscriminativeSupervisionSplitter { // Function to compute lattice scores for a lattice void ComputeLatticeScores(const Lattice &lat, LatticeInfo *scores) const; - // Prepare lattice : + // Prepare lattice : // 1) Order states in breadth-first search order // 2) Compute states times, which must be a strictly non-decreasing vector // 3) Compute lattice alpha and beta scores @@ -225,7 +214,7 @@ class DiscriminativeSupervisionSplitter { // Modifies the transition-ids on lat_ so that on each frame, there is just // one with any given pdf-id. This allows us to determinize and minimize // more completely. 
- void CollapseTransitionIds(const std::vector &state_times, + void CollapseTransitionIds(const std::vector &state_times, Lattice *lat) const; }; @@ -241,9 +230,6 @@ void AppendSupervision(const std::vector &inpu bool compactify, std::vector *output_supervision); -typedef TableWriter > DiscriminativeSupervisionWriter; -typedef SequentialTableReader > SequentialDiscriminativeSupervisionReader; -typedef RandomAccessTableReader > RandomAccessDiscriminativeSupervisionReader; } // namespace discriminative } // namespace kaldi diff --git a/src/nnet3/nnet-am-decodable-simple.cc b/src/nnet3/nnet-am-decodable-simple.cc index 9d2176965b1..35b1506336e 100644 --- a/src/nnet3/nnet-am-decodable-simple.cc +++ b/src/nnet3/nnet-am-decodable-simple.cc @@ -64,7 +64,7 @@ DecodableAmNnetSimple::DecodableAmNnetSimple( const MatrixBase *online_ivectors, int32 online_ivector_period, CachingOptimizingCompiler *compiler): - compiler_(am_nnet.GetNnet(), opts.optimize_config), + compiler_(am_nnet.GetNnet(), opts.optimize_config, opts.compiler_config), decodable_nnet_(opts, am_nnet.GetNnet(), am_nnet.Priors(), feats, compiler != NULL ? compiler : &compiler_, ivector, online_ivectors, @@ -318,7 +318,7 @@ DecodableAmNnetSimpleParallel::DecodableAmNnetSimpleParallel( const VectorBase *ivector, const MatrixBase *online_ivectors, int32 online_ivector_period): - compiler_(am_nnet.GetNnet(), opts.optimize_config), + compiler_(am_nnet.GetNnet(), opts.optimize_config, opts.compiler_config), trans_model_(trans_model), feats_copy_(NULL), ivector_copy_(NULL), diff --git a/src/nnet3/nnet-am-decodable-simple.h b/src/nnet3/nnet-am-decodable-simple.h index acf0ba8e63a..6b382fbe033 100644 --- a/src/nnet3/nnet-am-decodable-simple.h +++ b/src/nnet3/nnet-am-decodable-simple.h @@ -51,6 +51,7 @@ struct NnetSimpleComputationOptions { bool debug_computation; NnetOptimizeOptions optimize_config; NnetComputeOptions compute_config; + CachingOptimizingCompilerOptions compiler_config; NnetSimpleComputationOptions(): extra_left_context(0), @@ -60,7 +61,9 @@ struct NnetSimpleComputationOptions { frame_subsampling_factor(1), frames_per_chunk(50), acoustic_scale(0.1), - debug_computation(false) { } + debug_computation(false) { + compiler_config.cache_capacity += frames_per_chunk; + } void Register(OptionsItf *opts) { opts->Register("extra-left-context", &extra_left_context, diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 4f9cb4b92b8..005107a097c 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -290,28 +290,6 @@ void MergeChainExamples(bool compress, } } -void TruncateDerivWeights(int32 truncate, - NnetChainExample *eg) { - for (size_t i = 0; i < eg->outputs.size(); i++) { - NnetChainSupervision &supervision = eg->outputs[i]; - Vector &deriv_weights = supervision.deriv_weights; - if (deriv_weights.Dim() == 0) { - deriv_weights.Resize(supervision.indexes.size()); - deriv_weights.Set(1.0); - } - int32 num_sequences = supervision.supervision.num_sequences, - frames_per_sequence = supervision.supervision.frames_per_sequence; - KALDI_ASSERT(2 * truncate < frames_per_sequence); - for (int32 t = 0; t < truncate; t++) - for (int32 s = 0; s < num_sequences; s++) - deriv_weights(t * num_sequences + s) = 0.0; - for (int32 t = frames_per_sequence - truncate; - t < frames_per_sequence; t++) - for (int32 s = 0; s < num_sequences; s++) - deriv_weights(t * num_sequences + s) = 0.0; - } -} - void GetChainComputationRequest(const Nnet &nnet, const NnetChainExample &eg, bool need_model_derivative, 
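A note on the deriv-weights layout assumed by the TruncateDerivWeights code removed above: when examples are merged, the output rows interleave the sequences within each frame index, so frame t of sequence s sits at row t * num_sequences + s. For instance, with num_sequences = 2 and frames_per_sequence = 3 the row order is (t=0,s=0), (t=0,s=1), (t=1,s=0), (t=1,s=1), (t=2,s=0), (t=2,s=1); zeroing the first and last 'truncate' frames of every sequence therefore means zeroing rows t * num_sequences + s for t < truncate and for t >= frames_per_sequence - truncate, which is exactly what the removed loops did.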
diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index ac782a92805..7a024f3bfcd 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -189,15 +189,6 @@ void ShiftChainExampleTimes(int32 frame_shift, const std::vector &exclude_names, NnetChainExample *eg); -/** - This sets to zero any elements of 'egs->outputs[*].deriv_weights' that correspond - to frames within the first or last 'truncate' frames of the sequence (e.g. you could - set 'truncate=5' to set zero deriv-weight for the first and last 5 frames of the - sequence). - */ -void TruncateDerivWeights(int32 truncate, - NnetChainExample *eg); - /** This function takes a NnetChainExample and produces a ComputationRequest. Assumes you don't want the derivatives w.r.t. the inputs; if you do, you can create the ComputationRequest manually. Assumes that if diff --git a/src/nnet3/nnet-discriminative-example.cc b/src/nnet3/nnet-discriminative-example.cc index a7330e772a3..aa7eb48ea04 100644 --- a/src/nnet3/nnet-discriminative-example.cc +++ b/src/nnet3/nnet-discriminative-example.cc @@ -285,27 +285,6 @@ void MergeDiscriminativeExamples( } } -void TruncateDerivWeights(int32 truncate, - NnetDiscriminativeExample *eg) { - for (size_t i = 0; i < eg->outputs.size(); i++) { - NnetDiscriminativeSupervision &supervision = eg->outputs[i]; - Vector &deriv_weights = supervision.deriv_weights; - if (deriv_weights.Dim() == 0) { - deriv_weights.Resize(supervision.indexes.size()); - deriv_weights.Set(1.0); - } - int32 num_sequences = supervision.supervision.num_sequences, - frames_per_sequence = supervision.supervision.frames_per_sequence; - KALDI_ASSERT(2 * truncate < frames_per_sequence); - for (int32 t = 0; t < truncate; t++) - for (int32 s = 0; s < num_sequences; s++) - deriv_weights(t * num_sequences + s) = 0.0; - for (int32 t = frames_per_sequence - truncate; - t < frames_per_sequence; t++) - for (int32 s = 0; s < num_sequences; s++) - deriv_weights(t * num_sequences + s) = 0.0; - } -} void GetDiscriminativeComputationRequest(const Nnet &nnet, const NnetDiscriminativeExample &eg, diff --git a/src/nnet3/nnet-discriminative-example.h b/src/nnet3/nnet-discriminative-example.h index 9d9bba0c906..ba1cac7ffbe 100644 --- a/src/nnet3/nnet-discriminative-example.h +++ b/src/nnet3/nnet-discriminative-example.h @@ -196,15 +196,6 @@ void ShiftDiscriminativeExampleTimes(int32 frame_shift, const std::vector &exclude_names, NnetDiscriminativeExample *eg); -/** - This sets to zero any elements of 'egs->outputs[*].deriv_weights' that correspond - to frames within the first or last 'truncate' frames of the sequence (e.g. you could - set 'truncate=5' to set zero deriv-weight for the first and last 5 frames of the - sequence). - */ -void TruncateDerivWeights(int32 truncate, - NnetDiscriminativeExample *eg); - /** This function takes a NnetDiscriminativeExample and produces a ComputationRequest. Assumes you don't want the derivatives w.r.t. 
the inputs; if you do, you diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index fd576404f1d..2bae1dcdc43 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -16,7 +16,7 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-discriminative-get-egs nnet3-discriminative-copy-egs \ nnet3-discriminative-merge-egs nnet3-discriminative-shuffle-egs \ nnet3-discriminative-compute-objf nnet3-discriminative-train \ - discriminative-get-supervision nnet3-discriminative-subset-egs \ + nnet3-discriminative-subset-egs \ nnet3-discriminative-compute-from-egs nnet3-latgen-faster-looped OBJFILES = diff --git a/src/nnet3bin/discriminative-get-supervision.cc b/src/nnet3bin/discriminative-get-supervision.cc deleted file mode 100644 index 32d66c1c55a..00000000000 --- a/src/nnet3bin/discriminative-get-supervision.cc +++ /dev/null @@ -1,100 +0,0 @@ -// nnet3bin/discriminative-get-supervision.cc - -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) -// Copyright 2014-2015 Vimal Manohar - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "nnet3/discriminative-supervision.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - using namespace kaldi::discriminative; - typedef kaldi::int32 int32; - typedef kaldi::int64 int64; - - const char *usage = - "Get a discriminative training supervision object for each file of training data.\n" - "This will normally be piped into nnet3-discriminative-get-egs, where it\n" - "will be split up into pieces and combined with the features.\n" - "Usage: discriminative-get-supervision [options] \\\n" - " \n"; - - DiscriminativeSupervisionOptions sup_opts; - - ParseOptions po(usage); - - sup_opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - - std::string num_ali_rspecifier = po.GetArg(1), - den_lat_rspecifier = po.GetArg(2), - supervision_wspecifier = po.GetArg(3); - - DiscriminativeSupervisionWriter supervision_writer(supervision_wspecifier); - RandomAccessLatticeReader den_lat_reader(den_lat_rspecifier); - SequentialInt32VectorReader ali_reader(num_ali_rspecifier); - - int32 num_utts_done = 0, num_utts_error = 0; - - for (; !ali_reader.Done(); ali_reader.Next()) { - const std::string &key = ali_reader.Key(); - const std::vector &num_ali = ali_reader.Value(); - - if (!den_lat_reader.HasKey(key)) { - KALDI_WARN << "Could not find denominator lattice for utterance " - << key; - num_utts_error++; - continue; - } - - const Lattice &den_lat = den_lat_reader.Value(key); - - DiscriminativeSupervision supervision; - - if (!supervision.Initialize(num_ali, den_lat, 1.0)) { - KALDI_WARN << "Failed to convert lattice to supervision " - << "for utterance " << key; - num_utts_error++; - continue; - } - - 
supervision_writer.Write(key, supervision); - - num_utts_done++; - } - - KALDI_LOG << "Generated discriminative supervision information for " - << num_utts_done << " utterances, errors on " - << num_utts_error; - return (num_utts_done > num_utts_error ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - diff --git a/src/nnet3bin/nnet3-align-compiled.cc b/src/nnet3bin/nnet3-align-compiled.cc index 790c0938fdf..bab5d16f370 100644 --- a/src/nnet3bin/nnet3-align-compiled.cc +++ b/src/nnet3bin/nnet3-align-compiled.cc @@ -63,7 +63,7 @@ int main(int argc, char *argv[]) { int32 online_ivector_period = 0; align_config.Register(&po); decodable_opts.Register(&po); - + po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("transition-scale", &transition_scale, @@ -101,6 +101,7 @@ int main(int argc, char *argv[]) { double tot_like = 0.0; kaldi::int64 frame_count = 0; + { TransitionModel trans_model; AmNnetSimple am_nnet; @@ -110,6 +111,10 @@ int main(int argc, char *argv[]) { trans_model.Read(ki.Stream(), binary); am_nnet.Read(ki.Stream(), binary); } + // this compiler object allows caching of computations across + // different utterances. + CachingOptimizingCompiler compiler(am_nnet.GetNnet(), + decodable_opts.optimize_config); RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); @@ -173,7 +178,7 @@ int main(int argc, char *argv[]) { DecodableAmNnetSimple nnet_decodable( decodable_opts, trans_model, am_nnet, features, ivector, online_ivectors, - online_ivector_period); + online_ivector_period, &compiler); AlignUtteranceWrapper(align_config, utt, decodable_opts.acoustic_scale, @@ -199,5 +204,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/nnet3bin/nnet3-discriminative-copy-egs.cc b/src/nnet3bin/nnet3-discriminative-copy-egs.cc index 831484ebb11..17dc2ee4e13 100644 --- a/src/nnet3bin/nnet3-discriminative-copy-egs.cc +++ b/src/nnet3bin/nnet3-discriminative-copy-egs.cc @@ -58,7 +58,6 @@ int main(int argc, char *argv[]) { bool random = false; int32 srand_seed = 0; int32 frame_shift = 0; - int32 truncate_deriv_weights = 0; BaseFloat keep_proportion = 1.0; ParseOptions po(usage); @@ -74,9 +73,6 @@ int main(int argc, char *argv[]) { "in the supervision data (excluding iVector data) - useful in " "augmenting data. Note, the outputs will remain at the closest " "exact multiples of the frame subsampling factor"); - po.Register("truncate-deriv-weights", &truncate_deriv_weights, - "If nonzero, the number of initial/final subsample frames that " - "will have their derivatives' weights set to zero."); po.Read(argc, argv); @@ -106,7 +102,7 @@ int main(int argc, char *argv[]) { // count is normally 1; could be 0, or possibly >1. int32 count = GetCount(keep_proportion); std::string key = example_reader.Key(); - if (frame_shift == 0 && truncate_deriv_weights == 0) { + if (frame_shift == 0) { const NnetDiscriminativeExample &eg = example_reader.Value(); for (int32 c = 0; c < count; c++) { int32 index = (random ? Rand() : num_written) % num_outputs; @@ -117,8 +113,6 @@ int main(int argc, char *argv[]) { NnetDiscriminativeExample eg = example_reader.Value(); if (frame_shift != 0) ShiftDiscriminativeExampleTimes(frame_shift, exclude_names, &eg); - if (truncate_deriv_weights != 0) - TruncateDerivWeights(truncate_deriv_weights, &eg); for (int32 c = 0; c < count; c++) { int32 index = (random ? 
Rand() : num_written) % num_outputs; example_writers[index]->Write(key, eg); @@ -136,4 +130,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/nnet3bin/nnet3-discriminative-get-egs.cc b/src/nnet3bin/nnet3-discriminative-get-egs.cc index 070a88b331d..4a31876532f 100644 --- a/src/nnet3bin/nnet3-discriminative-get-egs.cc +++ b/src/nnet3bin/nnet3-discriminative-get-egs.cc @@ -161,18 +161,15 @@ int main(int argc, char *argv[]) { const char *usage = "Get frame-by-frame examples of data for nnet3+sequence neural network\n" - "training. This involves breaking up utterances into pieces of a\n" - "fixed size. Input will come from discriminative-get-supervision.\n" + "training. This involves breaking up utterances into pieces of sizes\n" + "determined by the --num-frames option.\n" "\n" "Usage: nnet3-discriminative-get-egs [options] " - " \n" + " \n" "\n" "An example [where $feats expands to the actual features]:\n" - "discriminative-get-supervision [args] | \\\n" - " nnet3-discriminative-get-egs --left-context=25 --right-context=9 --num-frames=20 \\\n" - " \"$feats\" ark,s,cs:- ark:degs.1.ark\n" - "Note: the --frame-subsampling-factor option must be the same as given to\n" - "discriminative-get-supervision.\n"; + " nnet3-discriminative-get-egs --left-context=25 --right-context=9 --num-frames=150,100,90 \\\n" + " \"$feats\" \"ark,s,cs:gunzip -c lat.1.gz\" scp:ali.scp ark:degs.1.ark\n"; bool compress = true; int32 length_tolerance = 100, online_ivector_period = 1; @@ -198,13 +195,11 @@ int main(int argc, char *argv[]) { po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - - ParseOptions splitter_opts("supervision-splitter", &po); - splitter_config.Register(&splitter_opts); + splitter_config.Register(&po); po.Read(argc, argv); - if (po.NumArgs() != 4) { + if (po.NumArgs() != 5) { po.PrintUsage(); exit(1); } @@ -212,14 +207,12 @@ int main(int argc, char *argv[]) { eg_config.ComputeDerived(); UtteranceSplitter utt_splitter(eg_config); - std::string model_wxfilename, feature_rspecifier, - supervision_rspecifier, - examples_wspecifier; + std::string model_wxfilename = po.GetArg(1), + feature_rspecifier = po.GetArg(2), + den_lat_rspecifier = po.GetArg(3), + num_ali_rspecifier = po.GetArg(4), + examples_wspecifier = po.GetArg(5); - model_wxfilename = po.GetArg(1); - feature_rspecifier = po.GetArg(2); - supervision_rspecifier = po.GetArg(3); - examples_wspecifier = po.GetArg(4); TransitionModel tmodel; { @@ -229,8 +222,8 @@ int main(int argc, char *argv[]) { } SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); - discriminative::RandomAccessDiscriminativeSupervisionReader supervision_reader( - supervision_rspecifier); + RandomAccessLatticeReader den_lat_reader(den_lat_rspecifier); + RandomAccessInt32VectorReader ali_reader(num_ali_rspecifier); NnetDiscriminativeExampleWriter example_writer(examples_wspecifier); RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); @@ -240,11 +233,23 @@ int main(int argc, char *argv[]) { for (; !feat_reader.Done(); feat_reader.Next()) { std::string key = feat_reader.Key(); const Matrix &feats = feat_reader.Value(); - if (!supervision_reader.HasKey(key)) { - KALDI_WARN << "No supervision for key " << key; + if (!den_lat_reader.HasKey(key)) { + KALDI_WARN << "No denominator lattice for key " << key; + num_err++; + } else if (!ali_reader.HasKey(key)) { + KALDI_WARN << "No numerator alignment for key " << key; num_err++; } else { - 
const discriminative::DiscriminativeSupervision &supervision = supervision_reader.Value(key); + discriminative::DiscriminativeSupervision supervision; + if (!supervision.Initialize(ali_reader.Value(key), + den_lat_reader.Value(key), + 1.0)) { + KALDI_WARN << "Failed to convert lattice to supervision " + << "for utterance " << key; + num_err++; + continue; + } + const Matrix *online_ivector_feats = NULL; if (!online_ivector_rspecifier.empty()) { if (!online_ivector_reader.HasKey(key)) { From bde13551efb4d30628f8b39b59b281485a4f9c6d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 14 Jan 2017 21:17:47 -0500 Subject: [PATCH 054/213] Removing option --modify-learning-rates from example nnet3 discriminative training scripts --- .../s5/local/nnet3/run_tdnn_discriminative.sh | 3 +-- .../s5/local/nnet3/run_tdnn_discriminative.sh | 3 +-- egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh | 3 +-- egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh | 3 +-- egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh | 3 +-- .../s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh | 11 +++++------ egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh | 3 +-- egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh | 3 +-- egs/wsj/s5/steps/nnet3/train_discriminative.sh | 1 - 9 files changed, 12 insertions(+), 21 deletions(-) diff --git a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh index aa2a845d6a8..dfaf8f90da3 100644 --- a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh @@ -48,7 +48,6 @@ regularization_opts= # Applicable for providing --xent-regularize and - minibatch_size=64 adjust_priors=true # May need to be set to false # because it does not help in some setups -modify_learning_rates=true last_layer_factor=0.1 ## Decode options @@ -145,7 +144,7 @@ if [ $stage -le 4 ]; then --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ --adjust-priors $adjust_priors \ - --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh index 51caba2bc98..cf26cac406a 100755 --- a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh @@ -55,7 +55,6 @@ regularization_opts= # Applicable for providing --xent-regularize and - minibatch_size=64 adjust_priors=true # May need to be set to false # because it does not help in some setups -modify_learning_rates=true last_layer_factor=0.1 ## Decode options @@ -152,7 +151,7 @@ if [ $stage -le 4 ]; then --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ --adjust-priors $adjust_priors \ - --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh index 2f5badba26c..fbf6d64aefa 100755 --- a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -54,7 +54,6 @@ regularization_opts= # Applicable for providing --xent-regularize and - minibatch_size=64 adjust_priors=true # May need to be set to 
false # because it does not help in some setups -modify_learning_rates=true last_layer_factor=0.1 ## Decode options @@ -157,7 +156,7 @@ if [ $stage -le 4 ]; then --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ --adjust-priors $adjust_priors \ - --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh index 91bcaf06ccb..255f1d49882 100755 --- a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh @@ -48,7 +48,6 @@ regularization_opts= # Applicable for providing --xent-regularize and - minibatch_size=64 adjust_priors=true # May need to be set to false # because it does not help in some setups -modify_learning_rates=true last_layer_factor=0.1 ## Decode options @@ -145,7 +144,7 @@ if [ $stage -le 4 ]; then --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ --adjust-priors $adjust_priors \ - --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh index 0c5e05556ad..805d38b4e88 100755 --- a/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh @@ -62,7 +62,6 @@ regularization_opts= # Applicable for providing --xent-regularize and - minibatch_size=64 adjust_priors=true # May need to be set to false # because it does not help in some setups -modify_learning_rates=true last_layer_factor=0.1 ## Decode options @@ -156,7 +155,7 @@ if [ $stage -le 4 ]; then --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ --adjust-priors $adjust_priors \ - --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh index b1f7e6f8c93..9641ce16e21 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh @@ -73,9 +73,10 @@ num_jobs_nnet=4 num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 # we may have to reduce this. -adjust_priors=true # May need to be set to false - # because it does not help in some setups -modify_learning_rates=true +adjust_priors=false # Note: this option will eventually be removed and + # the script will do it automatically but write to + # a different filename + last_layer_factor=0.1 # prevent the final layer from learning too fast; # this can be a problem. 
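With --modify-learning-rates gone, the learning-rate behaviour of these example scripts is controlled by two remaining knobs: --effective-lrate, which is defined so that the per-job rate is roughly the effective rate times num_jobs_nnet (averaging the parallel models then gives an overall step of the requested size), and --last-layer-factor, which scales down how fast the final layer moves relative to the rest (0.1 here), since the output layer otherwise tends to learn too fast, as the comments above note. A rough illustration of the first relation, with made-up values rather than the scripts' literal code:

  # illustrative values only; the real scripts compute this internally
  effective_learning_rate=0.0000125
  num_jobs_nnet=4
  per_job_lrate=$(perl -e "print $effective_learning_rate * $num_jobs_nnet;")   # prints 5e-05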
@@ -139,8 +140,6 @@ if [ -z "$degs_dir" ]; then fi fi -exit 0 # TODO: remove this - if [ $stage -le 3 ]; then steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ --stage $train_stage \ @@ -150,7 +149,7 @@ if [ $stage -le 3 ]; then --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ --adjust-priors $adjust_priors \ - --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh index b91208a0fe6..3fffd59426c 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh @@ -55,7 +55,6 @@ num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 adjust_priors=true -modify_learning_rates=true last_layer_factor=0.1 ## Decode options @@ -160,7 +159,7 @@ if [ $stage -le 4 ]; then --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ --adjust-priors $adjust_priors \ - --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh index 45bb36ea85c..b84688f574c 100755 --- a/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh @@ -47,7 +47,6 @@ num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 adjust_priors=true -modify_learning_rates=true last_layer_factor=0.1 ## Decode options @@ -144,7 +143,7 @@ if [ $stage -le 4 ]; then --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ --adjust-priors $adjust_priors \ - --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh index fb75e7b0aab..8d7484aa889 100755 --- a/egs/wsj/s5/steps/nnet3/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -37,7 +37,6 @@ num_jobs_nnet=4 # Number of neural net jobs to run in parallel. Note: this # versa). regularization_opts= minibatch_size=64 # This is the number of examples rather than the number of output frames. -modify_learning_rates=false # [deprecated] last_layer_factor=1.0 # relates to modify-learning-rates [deprecated] shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the samples # on each iter. You could set it to 0 or to a large value for complete From 1a8b6b28abb5ee97b0e6bed35596da3350316fd0 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 15 Jan 2017 16:15:36 -0500 Subject: [PATCH 055/213] Various script updates/fixes for discriminative training scripts; fix issue that Alexander Gorodetski pointed out on list RE a warning. 
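The egs-preparation fixes in this patch change how the validation and train-subset diagnostic jobs read their lattices and alignments: lat.scp (or ali.scp) is still filtered once into a small *_special.scp covering both subsets, but each job now narrows that small file down to its own utterance list before piping it into lattice-align-phones (or ali-to-pdf), instead of handing the combined list to both jobs, so neither job processes the other subset's entries. A minimal sketch of the pattern, with placeholder file names:

  # filter the long table once, covering both subsets
  utils/filter_scp.pl <(cat valid_uttlist train_subset_uttlist) <lat.scp >lat_special.scp
  # each job then narrows the small table to its own subset before consuming it
  utils/filter_scp.pl valid_uttlist lat_special.scp >valid_lats.scp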
--- .../local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh | 16 +++++----------- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 7 +++++-- egs/wsj/s5/steps/nnet3/get_egs.sh | 14 ++++++++++---- egs/wsj/s5/steps/nnet3/make_denlats.sh | 7 +++++-- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh index 9641ce16e21..97ed72ba429 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh @@ -17,7 +17,7 @@ # $0 --train-set train --gmm tri3 --nnet3-affix "" & - +set -e set -uo pipefail stage=1 @@ -27,20 +27,16 @@ use_gpu=true # for training cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, # alignments and degs). degs_dir= # set this to use preexisting degs. -# nj=400 # have a high number of jobs because this could take a while, and we might -# # have some stragglers. -nj=30 +nj=400 # have a high number of jobs because this could take a while, and we might + # have some stragglers. . ./cmd.sh . ./path.sh . ./utils/parse_options.sh srcdir=exp/nnet3_cleaned/tdnn_lstm1b_sp -#train_data_dir=data/train_cleaned_sp_hires_comb -#online_ivector_dir=exp/nnet3_cleaned/ivectors_train_cleaned_sp_hires_comb - -train_data_dir=data/dev_hires -online_ivector_dir=exp/nnet3_cleaned/ivectors_dev_hires +train_data_dir=data/train_cleaned_sp_hires_comb +online_ivector_dir=exp/nnet3_cleaned/ivectors_train_cleaned_sp_hires_comb ## Objective options criterion=smbr @@ -109,7 +105,6 @@ if [ $stage -le 1 ]; then steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ --frames-per-chunk $frames_per_chunk_decoding \ --extra-left-context $extra_left_context --extra-right-context $extra_right_context \ - --looped $looped \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --online-ivector-dir $online_ivector_dir \ --nj $nj $train_data_dir data/lang $srcdir ${srcdir}_ali ; @@ -133,7 +128,6 @@ if [ -z "$degs_dir" ]; then --extra-left-context-initial 0 --extra-right-context-final 0 \ --frames-per-chunk-decoding "$frames_per_chunk_decoding" \ --stage $get_egs_stage \ - --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg \ $train_data_dir data/lang ${srcdir} ${srcdir}_ali ${srcdir}_degs || exit 1 diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 3ca2fc84627..0b1ddd1fbc7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -313,18 +313,21 @@ if [ $stage -le 3 ]; then rm $dir/.error 2>/dev/null echo "$0: ... extracting validation and training-subset alignments." + # do the filtering just once, as lat.scp may be long. 
utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ <$dir/lat.scp >$dir/lat_special.scp $cmd $dir/log/create_valid_subset.log \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ + utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ $egs_opts $chaindir/normalization.fst \ "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index cb7ea0ac73c..c47522fec7a 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -288,17 +288,23 @@ if [ $stage -le 3 ]; then rm $dir/.error 2>/dev/null echo "$0: ... extracting validation and training-subset alignments." + + # do the filtering just once, as ali.scp may be long. utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ <$dir/ali.scp >$dir/ali_special.scp $cmd $dir/log/create_valid_subset.log \ + utils/filter_scp.pl $dir/valid_uttlist $dir/ali_special.scp \| \ + ali-to-pdf $alidir/final.mdl scp:- ark:- \| \ + ali-to-post ark:- ark:- \| \ nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opts $egs_opts "$valid_feats" \ - "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ - "ark:$dir/valid_all.egs" || touch $dir/.error & + ark,s,cs:- "ark:$dir/valid_all.egs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/ali_special.scp \| \ + ali-to-pdf $alidir/final.mdl scp:- ark:- \| \ + ali-to-post ark:- ark:- \| \ nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opts $egs_opts "$train_subset_feats" \ - "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ - "ark:$dir/train_subset_all.egs" || touch $dir/.error & + ark,s,cs:- "ark:$dir/train_subset_all.egs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." diff --git a/egs/wsj/s5/steps/nnet3/make_denlats.sh b/egs/wsj/s5/steps/nnet3/make_denlats.sh index d1591c0b1de..b9bb9bfd2a1 100755 --- a/egs/wsj/s5/steps/nnet3/make_denlats.sh +++ b/egs/wsj/s5/steps/nnet3/make_denlats.sh @@ -3,9 +3,12 @@ # 2014-2015 Vimal Manohar # Apache 2.0. -# Create denominator lattices for MMI/MPE training. +# Create denominator lattices for MMI/MPE training [deprecated]. # This version uses the neural-net models (version 3, i.e. the nnet3 code). # Creates its output in $dir/lat.*.gz +# Note: the more recent discriminative training scripts will not use this +# script at all, they'll use get_degs.sh which combines the decoding +# and egs-dumping into one script (to save disk space and disk I/O). 
# Begin configuration section. nj=4 @@ -174,7 +177,7 @@ fi lattice_determinize_cmd= if $determinize; then - lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=$minimize --prune --beam=$lattice_beam ark:- ark:- |" + lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=$minimize --prune=true --beam=$lattice_beam ark:- ark:- |" fi if [ $sub_split -eq 1 ]; then From b29eed538882bf24aa615c5337b4065dc5dc920f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 15 Jan 2017 16:59:06 -0500 Subject: [PATCH 056/213] Fix minor bugs --- egs/wsj/s5/steps/nnet3/get_degs.sh | 2 +- src/nnet3/discriminative-supervision.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index cc3ab5c4b13..74e936e29e0 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -310,7 +310,7 @@ fi # set the command to determinize lattices, if specified. if $determinize_before_split; then - lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --prune --beam=$lattice_beam ark:- ark:-" + lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --prune=true --beam=$lattice_beam ark:- ark:-" else lattice_determinize_cmd="cat" fi diff --git a/src/nnet3/discriminative-supervision.h b/src/nnet3/discriminative-supervision.h index d4c7ee3756e..a9d58d120f5 100644 --- a/src/nnet3/discriminative-supervision.h +++ b/src/nnet3/discriminative-supervision.h @@ -40,7 +40,6 @@ struct SplitDiscriminativeSupervisionOptions { BaseFloat acoustic_scale; SplitDiscriminativeSupervisionOptions() : - frame_subsampling_factor(1), remove_output_symbols(true), collapse_transition_ids(true), remove_epsilons(true), determinize(true), minimize(true), acoustic_scale(0.1) { } @@ -63,9 +62,6 @@ struct SplitDiscriminativeSupervisionOptions { opts->Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods (should match the " "value used in discriminative-get-supervision)"); - opts->Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " - "if the frame-rate for the model will be less than the " - "frame-rate of the original alignment."); } }; From e368b154d275dd633e29331a5e2cd2786bde838d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 16 Jan 2017 03:03:33 -0500 Subject: [PATCH 057/213] Some bug fixes to I/O code for nnet3 --- src/nnet3/nnet-computation.cc | 2 +- src/nnet3/nnet-compute.cc | 12 ++++++------ src/nnet3/nnet-optimize.cc | 7 +++++++ src/nnet3/nnet-training.cc | 14 +++++++------- src/nnet3/nnet-utils.cc | 4 ++-- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 5be1b7def94..819538d37f8 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -151,7 +151,7 @@ void NnetComputation::MatrixInfo::Read(std::istream &is, bool binary) { if (tok == "") { stride_type = kDefaultStride; } else { - KALDI_ASSERT(tok == ""); + KALDI_ASSERT(tok == ""); stride_type = kStrideEqualNumCols; ExpectToken(is, binary, ""); } diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index abda3646417..f15b2883989 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -153,16 +153,16 @@ void NnetComputer::ExecuteCommand() { case 
kAllocMatrixZeroed: m1 = computation_.submatrices[c.arg1].matrix_index; matrices_[m1].Resize(computation_.matrices[m1].num_rows, - computation_.matrices[m1].num_cols, - kSetZero, - computation_.matrices[m1].stride_type); + computation_.matrices[m1].num_cols, + kSetZero, + computation_.matrices[m1].stride_type); break; case kAllocMatrixUndefined: m1 = computation_.submatrices[c.arg1].matrix_index; matrices_[m1].Resize(computation_.matrices[m1].num_rows, - computation_.matrices[m1].num_cols, - kUndefined, - computation_.matrices[m1].stride_type); + computation_.matrices[m1].num_cols, + kUndefined, + computation_.matrices[m1].stride_type); break; case kDeallocMatrix: m1 = computation_.submatrices[c.arg1].matrix_index; diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index fcb0568dd5c..30b5f57feb7 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -603,6 +603,13 @@ void CachingOptimizingCompiler::ReadCache(std::istream &is, bool binary) { request->Read(is, binary); NnetComputation *computation = new NnetComputation(); computation->Read(is, binary); + if (GetVerboseLevel() >= 3) { + Timer timer; + CheckComputationOptions check_config; + ComputationChecker checker(check_config, nnet_, *computation); + checker.Check(); + seconds_taken_check_ += timer.Elapsed(); + } UpdateCache(request, computation); } } diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 9e534256e3f..9757452058e 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -44,11 +44,11 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, if (config_.read_cache != "") { bool binary; - try { - Input ki(config_.read_cache, &binary); + Input ki; + if (ki.Open(config_.read_cache, &binary)) { compiler_.ReadCache(ki.Stream(), binary); KALDI_LOG << "Read computation cache from " << config_.read_cache; - } catch (...) { + } else { KALDI_WARN << "Could not open cached computation. " "Probably this is the first training iteration."; } @@ -164,14 +164,14 @@ void NnetTrainer::UpdateParamsWithMaxChange() { if (min_scale < 1.0) ostr << "Per-component max-change active on " << num_max_change_per_component_applied_per_minibatch - << " / " << num_updatable << " Updatable Components." - << "(smallest factor=" << min_scale << " on " + << " / " << num_updatable << " updatable Components; " + << "smallest factor=" << min_scale << " on " << component_name_with_min_scale - << " with max-change=" << max_change_with_min_scale <<"). 
"; + << " with max-change=" << max_change_with_min_scale << '.'; if (param_delta > config_.max_param_change) ostr << "Global max-change factor was " << config_.max_param_change / param_delta - << " with max-change=" << config_.max_param_change << "."; + << " with max-change=" << config_.max_param_change << '.'; KALDI_LOG << ostr.str(); } // applies both of the max-change scalings all at once, component by component diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 02b92c19a40..30b7b12ffa7 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -71,10 +71,10 @@ void EvaluateComputationRequest( ComputationGraphBuilder builder(nnet, &graph); builder.Compute(request); builder.GetComputableInfo(is_computable); - if (GetVerboseLevel() >= 2) { + if (GetVerboseLevel() >= 4) { std::ostringstream graph_pretty; graph.Print(graph_pretty, nnet.GetNodeNames()); - KALDI_VLOG(3) << "Graph is " << graph_pretty.str(); + KALDI_VLOG(4) << "Graph is " << graph_pretty.str(); } } From d3d89ec85422aa21a44006a010fe075487399a72 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 16 Jan 2017 14:01:12 -0500 Subject: [PATCH 058/213] Discriminative-training script fixes --- .../local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh | 6 ++++-- egs/wsj/s5/steps/nnet3/get_degs.sh | 14 ++++++++++---- egs/wsj/s5/steps/nnet3/train_discriminative.sh | 4 ++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh index 97ed72ba429..bfebf708aad 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh @@ -68,7 +68,8 @@ max_param_change=1 num_jobs_nnet=4 num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options -minibatch_size=64 # we may have to reduce this. +minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); + # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). adjust_priors=false # Note: this option will eventually be removed and # the script will do it automatically but write to # a different filename @@ -135,11 +136,12 @@ if [ -z "$degs_dir" ]; then fi if [ $stage -le 3 ]; then + [ -z "$degs_dir" ] && degs_dir=${srcdir}_degs steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ --stage $train_stage \ --effective-lrate $effective_learning_rate --max-param-change $max_param_change \ --criterion $criterion --drop-frames true \ - --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ + --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size "$minibatch_size" \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ --adjust-priors $adjust_priors \ diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 74e936e29e0..9fbaf73d82c 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -123,7 +123,7 @@ extra_files= extra_files="$extra_files $transform_dir/trans.1 $transform_dir/num_jobs" # Check some files. -for f in $data/feats.scp $lang/L.fst $srcdir/${iter}.mdl $srcdir/tree \ +for f in $data/feats.scp $lang/L.fst $lang/phones/silence.csl $srcdir/${iter}.mdl $srcdir/tree \ $srcdir/cmvn_opts $alidir/ali.1.gz $alidir/num_jobs $extra_files; do [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; done @@ -243,6 +243,11 @@ if [ $stage -le 2 ]; then else # run without stderr redirection to show the error. feat-to-dim "$feats_one" -; exit 1 fi +else + num_frames=$(cat $dir/info/num_frames) +fi +if ! [ "$num_frames" -gt 0 ]; then + echo "$0: bad num-frames=$num_frames"; exit 1 fi # copy the model to the degs directory. @@ -256,6 +261,7 @@ num_archives=$[num_frames/frames_per_iter+1] echo $num_archives >$dir/info/num_archives echo $frame_subsampling_factor >$dir/info/frame_subsampling_factor +cp $lang/phones/silence.csl $dir/info/ # the first field in frames_per_eg (which is a comma-separated list of numbers) # is the 'principal' frames-per-eg, and for purposes of working out the number @@ -409,7 +415,7 @@ function shuffle { $bufsz=1000; @A = (); while() { push @A, $_; if (@A == $bufsz) { $n=int(rand()*$bufsz); print $A[$n]; $A[$n] = $A[$bufsz-1]; pop @A; }} @A = shuffle(@A); print @A; ' - } +} # funtion/pseudo-command to put input lines round robin to command line args. function round_robin { perl -e '@F=(); foreach $a (@ARGV) { my $f; open($f, ">$a") || die "opening file $a"; push @F, $f; } @@ -469,11 +475,11 @@ if [ $stage -le 7 ]; then run.pl $dir/log/copy_train_subset.log \ nnet3-discriminative-copy-egs scp:$dir/train_diagnostic.scp \ - ark:$dir/train_diagnostic.ark || exit 1 + ark:$dir/train_diagnostic.degs || exit 1 run.pl $dir/log/copy_valid_subset.log \ nnet3-discriminative-copy-egs scp:$dir/valid_diagnostic.scp \ - ark:$dir/valid_diagnostic.ark || exit 1 + ark:$dir/valid_diagnostic.degs || exit 1 fi if [ $stage -le 10 ] && $cleanup; then diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh index 8d7484aa889..139e9ba7505 100755 --- a/egs/wsj/s5/steps/nnet3/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -70,7 +70,7 @@ if [ -f path.sh ]; then . ./path.sh; fi if [ $# != 2 ]; then - echo "Usage: $0 [opts] " + echo "Usage: $0 [opts] " echo " e.g.: $0 exp/nnet3/tdnn_sp_degs exp/nnet3/tdnn_sp_smbr" echo "" echo "Main options (for others, see top of script file)" @@ -290,7 +290,7 @@ while [ $x -lt $num_iters ]; do --one-silence-class=$one_silence_class \ --boost=$boost --acoustic-scale=$acoustic_scale $regularization_opts \ $dir/$x.mdl \ - "ark:nnet3-discriminative-copy-egs --frame-shift=$frame_shift ark:$degs_dir/degs.$archive.ark ark:- | nnet3-discriminative-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" \ + "ark,bg:nnet3-discriminative-copy-egs --frame-shift=$frame_shift ark:$degs_dir/degs.$archive.ark ark:- | nnet3-discriminative-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait From d82fe44e9ea9936e139fdc71b59ef89936b9b782 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 16 Jan 2017 21:48:36 -0500 Subject: [PATCH 059/213] Add scripts for discriminative training of TDNNs on swbd. Small bug fix; various minor script improvements/fixes. 
--- egs/swbd/s5c/local/eval1997_data_prep.sh | 23 ++- egs/swbd/s5c/local/eval2000_data_prep.sh | 18 +- egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh | 3 +- .../s5c/local/nnet3/tuning/run_tdnn_d_disc.sh | 172 ++++++++++++++++++ egs/swbd/s5c/local/swbd1_prepare_dict.sh | 7 +- .../nnet3/tuning/run_tdnn_lstm_1b_disc.sh | 48 ++--- src/nnet3/nnet-computation.cc | 2 +- src/nnet3bin/nnet3-am-copy.cc | 1 - 8 files changed, 216 insertions(+), 58 deletions(-) create mode 100755 egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh diff --git a/egs/swbd/s5c/local/eval1997_data_prep.sh b/egs/swbd/s5c/local/eval1997_data_prep.sh index f49ac551192..e29da13deee 100755 --- a/egs/swbd/s5c/local/eval1997_data_prep.sh +++ b/egs/swbd/s5c/local/eval1997_data_prep.sh @@ -5,13 +5,13 @@ # To be run from one directory above this script. -# The input is a directory name containing the 1997 Hub5 english evaluation +# The input is a directory name containing the 1997 Hub5 english evaluation # test set and transcripts, which is LDC2002S10 # e.g. see # http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002S10 # # It is assumed that the transcripts are in a subdirectory called transcr -# However, we download the STM from NIST site: +# However, we download the STM from NIST site: # ftp://jaguar.ncsl.nist.gov/lvcsr/mar97/eval/hub5e97.english.980618.stm if [ $# -ne 1 ]; then @@ -26,7 +26,7 @@ sdir=$1 [ ! -d $sdir/transcr ] \ && echo Expecting directory $sdir/transcr to be present && exit 1; -. path.sh +. path.sh dir=data/local/eval1997 mkdir -p $dir @@ -40,7 +40,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -49,8 +49,8 @@ awk -v sph2pipe=$sph2pipe '{ # segments file format is: utt-id side-id start-time end-time, e.g.: # sw02001-A_000098-001156 sw02001-A 0.98 11.56 pem=$sdir/speech/97_hub5e.pem -[ ! -f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1; +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 # There is one line in the 97_hub5e.pem with an extra : on the channel # sw_10022 B: unknown_speaker 281.21 284.37 -- the : is removed @@ -64,7 +64,7 @@ grep -v ';;' $pem | sed -e 's?:??g' \ printf "%s %s %.2f %.2f\n", utt, spk, start, end; }' \ | sort -u > $dir/segments - + # Download the STM and GLM files: ( cd $dir rm -f stm glm @@ -78,9 +78,9 @@ grep -v ';;' $pem | sed -e 's?:??g' \ # stm file has lines like: -# en_4042 A en_4042_A 227.71 232.26 BEANS RIGHT THAT IS WHY I SAID BEANS -# One of the segments (sw_10022-B_028120-028437) is removed since it is not -# scored and does not show up in the pem file. +# en_4042 A en_4042_A 227.71 232.26 BEANS RIGHT THAT IS WHY I SAID BEANS +# One of the segments (sw_10022-B_028120-028437) is removed since it is not +# scored and does not show up in the pem file. grep -v ';;' $dir/hub5e97.english.980618.stm \ | awk '{ spk=$1"-"$2; @@ -96,7 +96,7 @@ grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text # create an utt2spk file that assumes each conversation side is # a separate speaker. 
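# (Illustrative, using the example utterance id quoted earlier in this script:
#  the awk command below turns the segments line
#    sw02001-A_000098-001156 sw02001-A 0.98 11.56
#  into the utt2spk entry
#    sw02001-A_000098-001156 sw02001-A
#  so the conversation side, not the actual talker, serves as the "speaker".)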
-awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp @@ -116,4 +116,3 @@ done echo Data preparation and formatting completed for Eval 2000 echo "(but not MFCC extraction)" - diff --git a/egs/swbd/s5c/local/eval2000_data_prep.sh b/egs/swbd/s5c/local/eval2000_data_prep.sh index 8d7e1f7ed6e..4c34061a120 100755 --- a/egs/swbd/s5c/local/eval2000_data_prep.sh +++ b/egs/swbd/s5c/local/eval2000_data_prep.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Hub-5 Eval 2000 data preparation +# Hub-5 Eval 2000 data preparation # Author: Arnab Ghoshal (Jan 2013) # To be run from one directory above this script. -# The input is two directory names (possibly the same) containing the +# The input is two directory names (possibly the same) containing the # 2000 Hub5 english evaluation test set and transcripts, which are # respectively: LDC2002S09 LDC2002T43 # e.g. see @@ -35,7 +35,7 @@ tdir=$2 [ ! -d $tdir/reference ] \ && echo Expecting directory $tdir/reference to be present && exit 1; -. path.sh +. path.sh dir=data/local/eval2000 mkdir -p $dir @@ -49,7 +49,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -58,8 +58,8 @@ awk -v sph2pipe=$sph2pipe '{ # segments file format is: utt-id side-id start-time end-time, e.g.: # sw02001-A_000098-001156 sw02001-A 0.98 11.56 pem=$sdir/english/hub5e_00.pem -[ ! -f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1; +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 # we ignore the warnings below for now, although they seem to indicate some problems @@ -72,7 +72,7 @@ grep -v ';;' $pem \ | sort -u | local/extend_segments.pl 0.1 > $dir/segments # stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ @@ -94,10 +94,10 @@ cp $tdir/reference/en20000405_hub5.glm $dir/glm echo "Segments from pem file and stm file do not match." && exit 1; grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text - + # create an utt2spk file that assumes each conversation side is # a separate speaker. 
-awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh index df02fec38fd..ec80972cf2d 100644 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh @@ -71,7 +71,7 @@ if [ $stage -le 9 ]; then relu-renorm-layer name=tdnn4 input=Append(-3,3) dim=1024 relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn6 dim=1024 - + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec EOF @@ -125,4 +125,3 @@ if [ $stage -le 11 ]; then fi wait; exit 0; - diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh new file mode 100755 index 00000000000..715a93ea49d --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +# This script does discriminative training on top of the CE nnet3 system +# from run_tdnn_d. To simplify things, this assumes you are using the "speed-perturbed" data +# (--speed_perturb true, which is the default) in the baseline run_tdnn_d.sh script. +# +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. + + +set -e +set -uo pipefail + +stage=1 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=0 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like + # alignments and degs). +degs_dir= # set this to use preexisting degs. +nj=400 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +graph_dir=exp/tri4/graph_sw1_tg +srcdir=exp/nnet3/tdnn_d_sp +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp_hires + + +## Objective options +criterion=smbr +one_silence_class=true + +# you can set --disc-affix if you run different configurations, e.g. --disc-affix "_b" +disc_affix= + +dir=${srcdir}_${criterion}${disc_affix} + +## Egs options. Give quite a few choices of chunk length, +## so it can split utterances without much gap or overlap. +frames_per_eg=300,280,150,120,100 +frames_overlap_per_eg=0 +frames_per_chunk_decoding=200 +## these context options should match the training condition. (chunk_left_context, +## chunk_right_context) +## We set --extra-left-context-initial 0 and --extra-right-context-final 0 +## directly in the script below, but this should also match the training condition. +## Note: extra-left-context and extra-right-context are 0 because this is a TDNN, +## it's not a recurrent model like an LSTM or BLSTM. +extra_left_context=0 +extra_right_context=0 + + +## Nnet training options +effective_learning_rate=0.0000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=3 +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options, + # in chain models. +minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); + # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). 
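+                                     # (Illustrative reading of the spec, going only by the rule stated in this
+                                     #  comment: egs whose chunk size is near 300 frames are merged 32 at a time,
+                                     #  with a final "mop-up" minibatch of 16 for the remainder; egs near 150 frames
+                                     #  are merged 64 at a time, with 32 for the remainder.  The string itself is
+                                     #  passed through unchanged by steps/nnet3/train_discriminative.sh to
+                                     #  nnet3-discriminative-merge-egs via its --minibatch-size option.)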
+ +last_layer_factor=0.1 # prevent the final layer from learning too fast; + # this can be a problem. + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! cuda-compiled; then + cat < WOLMANIZED # Also, mispronounced words, e.g. @@ -90,4 +90,3 @@ ln -sf lexicon5.txt lexicon.txt # This is the final lexicon. popd >&/dev/null rm $dir/lexiconp.txt 2>/dev/null echo Prepared input dictionary and phone-sets for Switchboard phase 1. - diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh index bfebf708aad..4fd74a71647 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh @@ -9,13 +9,6 @@ # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# how to run this (where $0 is the name of this script) -# by default, with the "cleaned" data: -# $0 - -# without the "cleaned" data: -# $0 --train-set train --gmm tri3 --nnet3-affix "" & - set -e set -uo pipefail @@ -24,7 +17,7 @@ stage=1 train_stage=-10 # can be used to start training in the middle. get_egs_stage=0 use_gpu=true # for training -cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like # alignments and degs). degs_dir= # set this to use preexisting degs. nj=400 # have a high number of jobs because this could take a while, and we might @@ -34,6 +27,7 @@ nj=400 # have a high number of jobs because this could take a while, and we migh . ./path.sh . ./utils/parse_options.sh +graph_dir=exp/tri3_cleaned/graph srcdir=exp/nnet3_cleaned/tdnn_lstm1b_sp train_data_dir=data/train_cleaned_sp_hires_comb online_ivector_dir=exp/nnet3_cleaned/ivectors_train_cleaned_sp_hires_comb @@ -58,7 +52,6 @@ frames_per_chunk_decoding=200 ## directly in the script below, but this should also match the training condition. extra_left_context=40 extra_right_context=0 -looped=true # affects alignments; because it's an LSTM, would be false for pure TDNNs or BLSTMs. @@ -67,12 +60,10 @@ effective_learning_rate=0.0000125 max_param_change=1 num_jobs_nnet=4 num_epochs=4 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options, + # in chain models. minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). -adjust_priors=false # Note: this option will eventually be removed and - # the script will do it automatically but write to - # a different filename last_layer_factor=0.1 # prevent the final layer from learning too fast; # this can be a problem. @@ -117,7 +108,7 @@ if [ -z "$degs_dir" ]; then if [ $stage -le 2 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d ${srcdir}_degs/storage ]; then utils/create_split_dir.pl \ - /export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage + /export/b{09,10,11,12}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/${srcdir}_degs/storage ${srcdir}_degs/storage fi if [ -d ${srcdir}_degs/storage ]; then max_copy_jobs=10; else max_copy_jobs=5; fi @@ -144,37 +135,36 @@ if [ $stage -le 3 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size "$minibatch_size" \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --adjust-priors $adjust_priors \ --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi -graph_dir=exp/tri3/graph -if [ $stage -le 5 ]; then +if [ $stage -le 4 ]; then for x in `seq $decode_start_epoch $num_epochs`; do for decode_set in dev test; do - ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` - iter=epoch$x.adj + for iter in epoch$x epoch${x}_adj; do - steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ - --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ - $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${iter:+_$iter} || exit 1; + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${iter} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - data/lang_test data/lang_rescore data/${decode_set}_hires \ - $dir/decode_${decode_set}${iter:+_$iter} \ - $dir/decode_${decode_set}${iter:+_$iter}_rescore || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test data/lang_rescore data/${decode_set}_hires \ + $dir/decode_${decode_set}_${iter} \ + $dir/decode_${decode_set}_${iter}_rescore || exit 1; ) & done done fi wait; -if [ $stage -le 6 ] && $cleanup; then +if [ $stage -le 5 ] && $cleanup; then # if you run with "--cleanup true --stage 6" you can clean up. - rm ${lats_dir}/lat.*.gz || true - rm ${srcdir}_ali/ali.*.gz || true + # actually, keep the alignments in case we need them later.. they're slow to + # create, and quite big. + # rm ${srcdir}_ali/ali.*.gz || true + steps/nnet2/remove_egs.sh ${srcdir}_degs || true fi diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 819538d37f8..5be1b7def94 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -151,7 +151,7 @@ void NnetComputation::MatrixInfo::Read(std::istream &is, bool binary) { if (tok == "") { stride_type = kDefaultStride; } else { - KALDI_ASSERT(tok == ""); + KALDI_ASSERT(tok == ""); stride_type = kStrideEqualNumCols; ExpectToken(is, binary, ""); } diff --git a/src/nnet3bin/nnet3-am-copy.cc b/src/nnet3bin/nnet3-am-copy.cc index 7aa0e4a32c0..5f697356dbf 100644 --- a/src/nnet3bin/nnet3-am-copy.cc +++ b/src/nnet3bin/nnet3-am-copy.cc @@ -47,7 +47,6 @@ int main(int argc, char *argv[]) { bool binary_write = true, raw = false; BaseFloat learning_rate = -1; - BaseFloat learning_rate_scale = 1; std::string set_raw_nnet = ""; bool convert_repeated_to_block = false; BaseFloat scale = 1.0; From e1db393d2fa7657fe842dca1f3aafb29731ac53d Mon Sep 17 00:00:00 2001 From: Yiming Wang Date: Tue, 17 Jan 2017 02:25:06 -0500 Subject: [PATCH 060/213] [src]: Removing SetZero() function in nnet3, adding SetAsGradient(). 
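In code terms, a hedged summary of the replacement idiom (signatures taken from
the hunks that follow): code that used to zero a network for gradient
accumulation with

    bool is_gradient = true;  // force simple update
    SetZero(is_gradient, deriv_nnet_);

now writes

    ScaleNnet(0.0, deriv_nnet_);
    SetNnetAsGradient(deriv_nnet_);  // sets learning_rate_ = 1.0 and is_gradient_ = true

while callers that passed is_gradient = false simply call ScaleNnet(0.0, nnet).
Component Scale(0.0) implementations now call SetZero() on their parameters, so
scaling by zero also clears any NaNs or infs.
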
(#1343) --- src/nnet3/nnet-chain-diagnostics.cc | 8 +- src/nnet3/nnet-chain-training.cc | 6 +- src/nnet3/nnet-component-itf.h | 9 +- src/nnet3/nnet-component-test.cc | 14 +- src/nnet3/nnet-derivative-test.cc | 4 +- src/nnet3/nnet-diagnostics.cc | 8 +- src/nnet3/nnet-discriminative-diagnostics.cc | 8 +- src/nnet3/nnet-discriminative-training.cc | 6 +- src/nnet3/nnet-general-component.cc | 17 +- src/nnet3/nnet-general-component.h | 1 - src/nnet3/nnet-optimize-test.cc | 11 +- src/nnet3/nnet-simple-component.cc | 175 ++++++++----------- src/nnet3/nnet-simple-component.h | 10 +- src/nnet3/nnet-training.cc | 6 +- src/nnet3/nnet-utils.cc | 93 +--------- src/nnet3/nnet-utils.h | 4 + 16 files changed, 126 insertions(+), 254 deletions(-) diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 54d73a6ead3..76abc5ce154 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -37,8 +37,8 @@ NnetChainComputeProb::NnetChainComputeProb( num_minibatches_processed_(0) { if (nnet_config_.compute_deriv) { deriv_nnet_ = new Nnet(nnet_); - bool is_gradient = true; // force simple update - SetZero(is_gradient, deriv_nnet_); + ScaleNnet(0.0, deriv_nnet_); + SetNnetAsGradient(deriv_nnet_); // force simple update } } @@ -56,8 +56,8 @@ void NnetChainComputeProb::Reset() { num_minibatches_processed_ = 0; objf_info_.clear(); if (deriv_nnet_) { - bool is_gradient = true; - SetZero(is_gradient, deriv_nnet_); + ScaleNnet(0.0, deriv_nnet_); + SetNnetAsGradient(deriv_nnet_); } } diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 1e293f588ae..4f63ba8304c 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -38,9 +38,7 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, KALDI_ASSERT(opts.nnet_config.momentum >= 0.0 && opts.nnet_config.max_param_change >= 0.0); delta_nnet_ = nnet_->Copy(); - bool is_gradient = false; // setting this to true would disable the - // natural-gradient updates. - SetZero(is_gradient, delta_nnet_); + ScaleNnet(0.0, delta_nnet_); const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); num_max_change_per_component_applied_.resize(num_updatable, 0); num_max_change_global_applied_ = 0; @@ -201,7 +199,7 @@ void NnetChainTrainer::UpdateParamsWithMaxChange() { if (param_delta > nnet_config.max_param_change) { if (param_delta - param_delta != 0.0) { KALDI_WARN << "Infinite parameter change, will not apply."; - SetZero(false, delta_nnet_); + ScaleNnet(0.0, delta_nnet_); } else { scale *= nnet_config.max_param_change / param_delta; num_max_change_global_applied_++; diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 9dc372340be..c1732fc9b25 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -375,11 +375,6 @@ class UpdatableComponent: public Component { learning_rate_factor_(other.learning_rate_factor_), is_gradient_(other.is_gradient_), max_change_(other.max_change_) { } - /// \brief Sets parameters to zero, and if treat_as_gradient is true, - /// sets is_gradient_ to true and sets learning_rate_ to 1, ignoring - /// learning_rate_factor_. - virtual void SetZero(bool treat_as_gradient) = 0; - UpdatableComponent(): learning_rate_(0.001), learning_rate_factor_(1.0), is_gradient_(false), max_change_(0.0) { } @@ -403,6 +398,10 @@ class UpdatableComponent: public Component { /// Sets the learning rate directly, bypassing learning_rate_factor_. 
virtual void SetActualLearningRate(BaseFloat lrate) { learning_rate_ = lrate; } + /// \brief Sets is_gradient_ to true and sets learning_rate_ to 1, ignoring + /// learning_rate_factor_. + virtual void SetAsGradient() { learning_rate_ = 1.0; is_gradient_ = true; } + /// Gets the learning rate of gradient descent. Note: if you call /// SetLearningRate(x), and learning_rate_factor_ != 1.0, /// a different value than x will returned. diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index 1cb96563b77..288179b2ffe 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -108,7 +108,7 @@ void TestNnetComponentVectorizeUnVectorize(Component *c) { UpdatableComponent *uc = dynamic_cast(c); KALDI_ASSERT(uc != NULL); UpdatableComponent *uc2 = dynamic_cast(uc->Copy()); - uc2->SetZero(false); + uc2->Scale(0.0); Vector params(uc2->NumParameters()); uc2->Vectorize(¶ms); KALDI_ASSERT(params.Min()==0.0 && params.Sum()==0.0); @@ -146,14 +146,14 @@ void TestNnetComponentUpdatable(Component *c) { } if(!(uc->Properties() & kUpdatableComponent)){ // testing that if it declares itself as non-updatable, - // Scale() and Add() and SetZero() have no effect. + // Scale() and Add() have no effect. KALDI_ASSERT(uc->NumParameters() == 0); KALDI_ASSERT(uc->DotProduct(*uc) == 0); UpdatableComponent *uc2 = dynamic_cast(uc->Copy()); uc2->Scale(7.0); uc2->Add(3.0, *uc); KALDI_ASSERT(StringsApproxEqual(uc2->Info(), uc->Info())); - uc->SetZero(false); + uc->Scale(0.0); KALDI_ASSERT(StringsApproxEqual(uc2->Info(), uc->Info())); delete uc2; } else { @@ -179,13 +179,13 @@ void TestNnetComponentUpdatable(Component *c) { uc3->Scale(0.5); KALDI_ASSERT(uc2->Info() == uc3->Info()); - // testing that SetZero() works the same whether done on the vectorized + // testing that Scale(0.0) works the same whether done on the vectorized // paramters or via SetZero(), and that unvectorizing something that's been // zeroed gives us zero parameters. uc2->Vectorize(&vec2); vec2.SetZero(); uc2->UnVectorize(vec2); - uc3->SetZero(false); + uc3->Scale(0.0); uc3->Vectorize(&vec2); KALDI_ASSERT(uc2->Info() == uc3->Info() && VecVec(vec2, vec2) == 0.0); @@ -422,8 +422,8 @@ bool TestSimpleComponentModelDerivative(const Component &c, UpdatableComponent *uc_copy = dynamic_cast(c_copy); KALDI_ASSERT(uc != NULL && uc_copy != NULL); if (test_derivative) { - bool is_gradient = true; - uc_copy->SetZero(is_gradient); + uc_copy->Scale(0.0); + uc_copy->SetAsGradient(); } CuMatrix input_deriv(num_rows, input_dim, diff --git a/src/nnet3/nnet-derivative-test.cc b/src/nnet3/nnet-derivative-test.cc index 1f9e61e2b2a..f76377a544c 100644 --- a/src/nnet3/nnet-derivative-test.cc +++ b/src/nnet3/nnet-derivative-test.cc @@ -136,8 +136,8 @@ void UnitTestNnetModelDerivatives() { } Nnet nnet_deriv(nnet); - bool is_gradient = true; - SetZero(is_gradient, &nnet_deriv); // forces "simple" update and unit + ScaleNnet(0.0, &nnet_deriv); + SetNnetAsGradient(&nnet_deriv); // forces "simple" update and unit // learning rate. int32 num_directions = 4; // must be >= 1. 
Best if it's >1, will reduce diff --git a/src/nnet3/nnet-diagnostics.cc b/src/nnet3/nnet-diagnostics.cc index d7de17682da..302e2cbfa50 100644 --- a/src/nnet3/nnet-diagnostics.cc +++ b/src/nnet3/nnet-diagnostics.cc @@ -32,8 +32,8 @@ NnetComputeProb::NnetComputeProb(const NnetComputeProbOptions &config, num_minibatches_processed_(0) { if (config_.compute_deriv) { deriv_nnet_ = new Nnet(nnet_); - bool is_gradient = true; // force simple update - SetZero(is_gradient, deriv_nnet_); + ScaleNnet(0.0, deriv_nnet_); + SetNnetAsGradient(deriv_nnet_); // force simple update } } @@ -52,8 +52,8 @@ void NnetComputeProb::Reset() { objf_info_.clear(); accuracy_info_.clear(); if (deriv_nnet_) { - bool is_gradient = true; - SetZero(is_gradient, deriv_nnet_); + ScaleNnet(0.0, deriv_nnet_); + SetNnetAsGradient(deriv_nnet_); } } diff --git a/src/nnet3/nnet-discriminative-diagnostics.cc b/src/nnet3/nnet-discriminative-diagnostics.cc index 417a6fa05ac..f23af549d72 100644 --- a/src/nnet3/nnet-discriminative-diagnostics.cc +++ b/src/nnet3/nnet-discriminative-diagnostics.cc @@ -42,8 +42,8 @@ NnetDiscriminativeComputeObjf::NnetDiscriminativeComputeObjf( log_priors_.ApplyLog(); if (nnet_config_.compute_deriv) { deriv_nnet_ = new Nnet(nnet_); - bool is_gradient = true; // force simple update - SetZero(is_gradient, deriv_nnet_); + ScaleNnet(0.0, deriv_nnet_); + SetNnetAsGradient(deriv_nnet_); // force simple update } } @@ -61,8 +61,8 @@ void NnetDiscriminativeComputeObjf::Reset() { num_minibatches_processed_ = 0; objf_info_.clear(); if (deriv_nnet_) { - bool is_gradient = true; - SetZero(is_gradient, deriv_nnet_); + ScaleNnet(0.0, deriv_nnet_); + SetNnetAsGradient(deriv_nnet_); } } diff --git a/src/nnet3/nnet-discriminative-training.cc b/src/nnet3/nnet-discriminative-training.cc index 15c91d5c23b..5ef1675c5ca 100644 --- a/src/nnet3/nnet-discriminative-training.cc +++ b/src/nnet3/nnet-discriminative-training.cc @@ -42,9 +42,7 @@ NnetDiscriminativeTrainer::NnetDiscriminativeTrainer( KALDI_ASSERT(opts.nnet_config.momentum >= 0.0 && opts.nnet_config.max_param_change >= 0.0); delta_nnet_ = nnet_->Copy(); - bool is_gradient = false; // setting this to true would disable the - // natural-gradient updates. 
- SetZero(is_gradient, delta_nnet_); + ScaleNnet(0.0, delta_nnet_); } if (opts.nnet_config.read_cache != "") { bool binary; @@ -92,7 +90,7 @@ void NnetDiscriminativeTrainer::Train(const NnetDiscriminativeExample &eg) { if (param_delta > nnet_config.max_param_change) { if (param_delta - param_delta != 0.0) { KALDI_WARN << "Infinite parameter change, will not apply."; - SetZero(false, delta_nnet_); + ScaleNnet(0.0, delta_nnet_); } else { scale *= nnet_config.max_param_change / param_delta; KALDI_LOG << "Parameter change too big: " << param_delta << " > " diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 13ccb0a7714..8cfad20f19e 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1305,8 +1305,13 @@ Component* ConstantComponent::Copy() const { } void ConstantComponent::Scale(BaseFloat scale) { - if (is_updatable_) - output_.Scale(scale); + if (is_updatable_) { + if (scale == 0.0) { + output_.SetZero(); + } else { + output_.Scale(scale); + } + } } void ConstantComponent::Add(BaseFloat alpha, const Component &other_in) { @@ -1318,14 +1323,6 @@ void ConstantComponent::Add(BaseFloat alpha, const Component &other_in) { } } -void ConstantComponent::SetZero(bool treat_as_gradient) { - if (treat_as_gradient) { - SetActualLearningRate(1.0); - is_gradient_ = true; - } - output_.SetZero(); -} - void ConstantComponent::PerturbParams(BaseFloat stddev) { CuVector temp_output(output_.Dim(), kUndefined); temp_output.SetRandn(); diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 780ec8466e6..2ddf4f40172 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -648,7 +648,6 @@ class ConstantComponent: public UpdatableComponent { // Some functions from base-class UpdatableComponent. virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index 0654683aa9c..0044ee05c51 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -94,10 +94,10 @@ static bool UnitTestNnetOptimizeWithOptions(int32 srand_seed, // test the consolidation of backprop commands, // otherwise the optimized and non-optimized // comptuations differ. - bool is_gradient = true; // with natural gradient, the consolidation would - // affect the final model params -> test just the - // gradient. - SetZero(is_gradient, &nnet_to_update); + ScaleNnet(0.0, &nnet_to_update); + // with natural gradient, the consolidation would affect the final model + // params -> test just the gradient. + SetNnetAsGradient(&nnet_to_update); NnetComputer computer(compute_opts, computation, @@ -107,7 +107,8 @@ static bool UnitTestNnetOptimizeWithOptions(int32 srand_seed, Nnet nnet_opt(nnet); // copy of the nnet for the optimized computation. // necessary in case backprop changes parameters. Nnet nnet_opt_to_update(nnet_opt); - SetZero(is_gradient, &nnet_opt_to_update); + ScaleNnet(0.0, &nnet_opt_to_update); + SetNnetAsGradient(&nnet_opt_to_update); // NnetComputer for the optimized version of the computation. 
NnetComputer computer_opt(compute_opts, diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 84a262b1695..93cbe467a73 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1136,8 +1136,14 @@ void RectifiedLinearComponent::StoreStats( } void AffineComponent::Scale(BaseFloat scale) { - linear_params_.Scale(scale); - bias_params_.Scale(scale); + if (scale == 0.0) { + // If scale == 0.0 we call SetZero() which will get rid of NaN's and inf's. + linear_params_.SetZero(); + bias_params_.SetZero(); + } else { + linear_params_.Scale(scale); + bias_params_.Scale(scale); + } } void AffineComponent::Resize(int32 input_dim, int32 output_dim) { @@ -1169,17 +1175,6 @@ AffineComponent::AffineComponent(const CuMatrixBase &linear_params, bias_params.Dim() != 0); } - - -void AffineComponent::SetZero(bool treat_as_gradient) { - if (treat_as_gradient) { - SetActualLearningRate(1.0); - is_gradient_ = true; - } - linear_params_.SetZero(); - bias_params_.SetZero(); -} - void AffineComponent::SetParams(const VectorBase &bias, const MatrixBase &linear) { bias_params_ = bias; @@ -1425,8 +1420,13 @@ RepeatedAffineComponent::RepeatedAffineComponent(const RepeatedAffineComponent & void RepeatedAffineComponent::Scale(BaseFloat scale) { - linear_params_.Scale(scale); - bias_params_.Scale(scale); + if (scale == 0.0) { + linear_params_.SetZero(); + bias_params_.SetZero(); + } else { + linear_params_.Scale(scale); + bias_params_.Scale(scale); + } } void RepeatedAffineComponent::Add(BaseFloat alpha, const Component &other_in) { @@ -1437,15 +1437,6 @@ void RepeatedAffineComponent::Add(BaseFloat alpha, const Component &other_in) { bias_params_.AddVec(alpha, other->bias_params_); } -void RepeatedAffineComponent::SetZero(bool treat_as_gradient) { - if (treat_as_gradient) { - SetActualLearningRate(1.0); - is_gradient_ = true; - } - linear_params_.SetZero(); - bias_params_.SetZero(); -} - void RepeatedAffineComponent::PerturbParams(BaseFloat stddev){ CuMatrix temp_linear_params(linear_params_); temp_linear_params.SetRandn(); @@ -1932,8 +1923,13 @@ void BlockAffineComponent::Backprop(const std::string &debug_info, } void BlockAffineComponent::Scale(BaseFloat scale) { - linear_params_.Scale(scale); - bias_params_.Scale(scale); + if (scale == 0.0) { + linear_params_.SetZero(); + bias_params_.SetZero(); + } else { + linear_params_.Scale(scale); + bias_params_.Scale(scale); + } } void BlockAffineComponent::Add(BaseFloat alpha, const Component &other_in) { @@ -1944,15 +1940,6 @@ void BlockAffineComponent::Add(BaseFloat alpha, const Component &other_in) { bias_params_.AddVec(alpha, other->bias_params_); } -void BlockAffineComponent::SetZero(bool treat_as_gradient) { - if (treat_as_gradient) { - SetActualLearningRate(1.0); - is_gradient_ = true; - } - linear_params_.SetZero(); - bias_params_.SetZero(); -} - void BlockAffineComponent::PerturbParams(BaseFloat stddev) { CuMatrix temp_linear_params(linear_params_); temp_linear_params.SetRandn(); @@ -2017,7 +2004,11 @@ void BlockAffineComponent::UnVectorize(const VectorBase ¶ms) { } void PerElementScaleComponent::Scale(BaseFloat scale) { - scales_.Scale(scale); + if (scale == 0.0) { + scales_.SetZero(); + } else { + scales_.Scale(scale); + } } void PerElementScaleComponent::Add(BaseFloat alpha, @@ -2033,14 +2024,6 @@ PerElementScaleComponent::PerElementScaleComponent( UpdatableComponent(component), scales_(component.scales_) { } -void PerElementScaleComponent::SetZero(bool treat_as_gradient) { - if 
(treat_as_gradient) { - SetActualLearningRate(1.0); - is_gradient_ = true; - } - scales_.SetZero(); -} - void PerElementScaleComponent::PerturbParams(BaseFloat stddev) { CuVector temp_scales(scales_.Dim(), kUndefined); temp_scales.SetRandn(); @@ -2180,7 +2163,11 @@ void PerElementScaleComponent::UnVectorize( } void PerElementOffsetComponent::Scale(BaseFloat scale) { - offsets_.Scale(scale); + if (scale == 0.0) { + offsets_.SetZero(); + } else { + offsets_.Scale(scale); + } } @@ -2197,14 +2184,6 @@ PerElementOffsetComponent::PerElementOffsetComponent( UpdatableComponent(component), offsets_(component.offsets_) { } -void PerElementOffsetComponent::SetZero(bool treat_as_gradient) { - if (treat_as_gradient) { - SetActualLearningRate(1.0); - is_gradient_ = true; - } - offsets_.SetZero(); -} - void PerElementOffsetComponent::PerturbParams(BaseFloat stddev) { CuVector temp_offsets(offsets_.Dim(), kUndefined); temp_offsets.SetRandn(); @@ -2447,8 +2426,13 @@ Component* ConstantFunctionComponent::Copy() const { } void ConstantFunctionComponent::Scale(BaseFloat scale) { - if (is_updatable_) - output_.Scale(scale); + if (is_updatable_) { + if (scale == 0.0) { + output_.SetZero(); + } else { + output_.Scale(scale); + } + } } void ConstantFunctionComponent::Add(BaseFloat alpha, const Component &other_in) { @@ -2460,14 +2444,6 @@ void ConstantFunctionComponent::Add(BaseFloat alpha, const Component &other_in) } } -void ConstantFunctionComponent::SetZero(bool treat_as_gradient) { - if (treat_as_gradient) { - SetActualLearningRate(1.0); - is_gradient_ = true; - } - output_.SetZero(); -} - void ConstantFunctionComponent::PerturbParams(BaseFloat stddev) { CuVector temp_output(output_.Dim(), kUndefined); temp_output.SetRandn(); @@ -3734,8 +3710,13 @@ void ConvolutionComponent::Propagate(const ComponentPrecomputedIndexes *indexes, // scale the parameters void ConvolutionComponent::Scale(BaseFloat scale) { - filter_params_.Scale(scale); - bias_params_.Scale(scale); + if (scale == 0.0) { + filter_params_.SetZero(); + bias_params_.SetZero(); + } else { + filter_params_.Scale(scale); + bias_params_.Scale(scale); + } } // add another convolution component @@ -3976,15 +3957,6 @@ void ConvolutionComponent::Update(const std::string &debug_info, bias_params_.AddVec(learning_rate_, bias_grad); } -void ConvolutionComponent::SetZero(bool treat_as_gradient) { - if (treat_as_gradient) { - SetActualLearningRate(1.0); - is_gradient_ = true; - } - filter_params_.SetZero(); - bias_params_.SetZero(); -} - void ConvolutionComponent::Read(std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. ExpectToken(is, binary, ""); @@ -4796,18 +4768,6 @@ void CompositeComponent::Add(BaseFloat alpha, const Component &other_in) { components_[i]->Add(alpha, *(other->components_[i])); } -// virtual -void CompositeComponent::SetZero(bool treat_as_gradient) { - KALDI_ASSERT(this->IsUpdatable()); // or should not be called. - for (size_t i = 0; i < components_.size(); i++) { - if (components_[i]->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = - dynamic_cast(components_[i]); - uc->SetZero(treat_as_gradient); - } - } -} - // virtual void CompositeComponent::PerturbParams(BaseFloat stddev) { KALDI_ASSERT(this->IsUpdatable()); // or should not be called. @@ -4848,6 +4808,19 @@ void CompositeComponent::SetActualLearningRate(BaseFloat lrate) { } } +// virtual +void CompositeComponent::SetAsGradient() { + KALDI_ASSERT(this->IsUpdatable()); // or should not be called. 
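  // (Note: the base-class call below sets learning_rate_ = 1.0 and
  //  is_gradient_ = true on the composite component itself; the loop that
  //  follows applies the same setting to each updatable sub-component.)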
+ UpdatableComponent::SetAsGradient(); + for (size_t i = 0; i < components_.size(); i++) { + if (components_[i]->Properties() & kUpdatableComponent) { + UpdatableComponent *uc = + dynamic_cast(components_[i]); + uc->SetAsGradient(); + } + } +} + // virtual int32 CompositeComponent::NumParameters() const { KALDI_ASSERT(this->IsUpdatable()); // or should not be called. @@ -5111,11 +5084,19 @@ Component* LstmNonlinearityComponent::Copy() const { } void LstmNonlinearityComponent::Scale(BaseFloat scale) { - params_.Scale(scale); - value_sum_.Scale(scale); - deriv_sum_.Scale(scale); - self_repair_total_.Scale(scale); - count_ *= scale; + if (scale == 0.0) { + params_.SetZero(); + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_.SetZero(); + count_ = 0.0; + } else { + params_.Scale(scale); + value_sum_.Scale(scale); + deriv_sum_.Scale(scale); + self_repair_total_.Scale(scale); + count_ *= scale; + } } void LstmNonlinearityComponent::Add(BaseFloat alpha, @@ -5130,18 +5111,6 @@ void LstmNonlinearityComponent::Add(BaseFloat alpha, count_ += alpha * other->count_; } -void LstmNonlinearityComponent::SetZero(bool treat_as_gradient) { - if (treat_as_gradient) { - SetActualLearningRate(1.0); - is_gradient_ = true; - } - params_.SetZero(); - value_sum_.SetZero(); - deriv_sum_.SetZero(); - self_repair_total_.SetZero(); - count_ = 0.0; -} - void LstmNonlinearityComponent::PerturbParams(BaseFloat stddev) { CuMatrix temp_params(params_.NumRows(), params_.NumCols()); temp_params.SetRandn(); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index f8cd39cb06e..62b4c9006d8 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -421,7 +421,6 @@ class AffineComponent: public UpdatableComponent { // Some functions from base-class UpdatableComponent. virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; @@ -522,7 +521,6 @@ class BlockAffineComponent : public UpdatableComponent { // Functions from base-class UpdatableComponent. virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; @@ -585,7 +583,6 @@ class RepeatedAffineComponent: public UpdatableComponent { // Some functions from base-class UpdatableComponent. virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; @@ -1235,7 +1232,6 @@ class PerElementScaleComponent: public UpdatableComponent { // Some functions from base-class UpdatableComponent. virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; @@ -1307,7 +1303,6 @@ class PerElementOffsetComponent: public UpdatableComponent { // Some functions from base-class UpdatableComponent. 
virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; @@ -1376,7 +1371,6 @@ class ConstantFunctionComponent: public UpdatableComponent { // Some functions from base-class UpdatableComponent. virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; @@ -1579,7 +1573,6 @@ class ConvolutionComponent: public UpdatableComponent { // Some functions from base-class UpdatableComponent. virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; @@ -1747,7 +1740,6 @@ class LstmNonlinearityComponent: public UpdatableComponent { // Some functions from base-class UpdatableComponent. virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; @@ -2003,9 +1995,9 @@ class CompositeComponent: public UpdatableComponent { // Some functions from base-class UpdatableComponent. virtual void SetUnderlyingLearningRate(BaseFloat lrate); virtual void SetActualLearningRate(BaseFloat lrate); + virtual void SetAsGradient(); virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); - virtual void SetZero(bool treat_as_gradient); virtual void PerturbParams(BaseFloat stddev); virtual BaseFloat DotProduct(const UpdatableComponent &other) const; virtual int32 NumParameters() const; diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 9757452058e..6bac172b5bd 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -35,9 +35,7 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, KALDI_ASSERT(config.momentum >= 0.0 && config.max_param_change >= 0.0); delta_nnet_ = nnet_->Copy(); - bool is_gradient = false; // setting this to true would disable the - // natural-gradient updates. 
- SetZero(is_gradient, delta_nnet_); + ScaleNnet(0.0, delta_nnet_); const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); num_max_change_per_component_applied_.resize(num_updatable, 0); num_max_change_global_applied_ = 0; @@ -150,7 +148,7 @@ void NnetTrainer::UpdateParamsWithMaxChange() { if (param_delta > config_.max_param_change) { if (param_delta - param_delta != 0.0) { KALDI_WARN << "Infinite parameter change, will not apply."; - SetZero(false, delta_nnet_); + ScaleNnet(0.0, delta_nnet_); } else { scale *= config_.max_param_change / param_delta; num_max_change_global_applied_++; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 30b7b12ffa7..865fdcd7c0a 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -168,25 +168,6 @@ void ComputeSimpleNnetContext(const Nnet &nnet, *std::max_element(right_contexts.begin(), right_contexts.end()); } -void SetZero(bool is_gradient, - Nnet *nnet) { - for (int32 c = 0; c < nnet->NumComponents(); c++) { - Component *comp = nnet->GetComponent(c); - NonlinearComponent *nc = dynamic_cast(comp); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *u_comp = dynamic_cast(comp); - KALDI_ASSERT(u_comp != NULL); - u_comp->SetZero(is_gradient); - } else if (nc != NULL) { - nc->ZeroStats(); - } else { - // Scale(0.0) is called as a backup; currently it should never - // do anything useful for any component type. - comp->Scale(0.0); - } - } -} - void PerturbParams(BaseFloat stddev, Nnet *nnet) { for (int32 c = 0; c < nnet->NumComponents(); c++) { @@ -264,22 +245,6 @@ void ZeroComponentStats(Nnet *nnet) { } } -void ScaleLearningRate(BaseFloat learning_rate_scale, - Nnet *nnet) { - for (int32 c = 0; c < nnet->NumComponents(); c++) { - Component *comp = nnet->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - // For now all updatable components inherit from class UpdatableComponent. - // If that changes in future, we will change this code. - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - "UpdatableComponent; change this code."; - uc->SetActualLearningRate(uc->LearningRate() * learning_rate_scale); - } - } -} - void SetLearningRate(BaseFloat learning_rate, Nnet *nnet) { for (int32 c = 0; c < nnet->NumComponents(); c++) { @@ -296,68 +261,20 @@ void SetLearningRate(BaseFloat learning_rate, } } -void SetLearningRates(const Vector &learning_rates, - Nnet *nnet) { - int32 i = 0; +void SetNnetAsGradient(Nnet *nnet) { for (int32 c = 0; c < nnet->NumComponents(); c++) { Component *comp = nnet->GetComponent(c); if (comp->Properties() & kUpdatableComponent) { - // For now all updatable components inherit from class UpdatableComponent. - // If that changes in future, we will change this code. - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - "UpdatableComponent; change this code."; - KALDI_ASSERT(i < learning_rates.Dim()); - uc->SetActualLearningRate(learning_rates(i++)); - } - } - KALDI_ASSERT(i == learning_rates.Dim()); -} - -void GetLearningRates(const Nnet &nnet, - Vector *learning_rates) { - learning_rates->Resize(NumUpdatableComponents(nnet)); - int32 i = 0; - for (int32 c = 0; c < nnet.NumComponents(); c++) { - const Component *comp = nnet.GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - // For now all updatable components inherit from class UpdatableComponent. 
- // If that changes in future, we will change this code. - const UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - "UpdatableComponent; change this code."; - (*learning_rates)(i++) = uc->LearningRate(); - } - } - KALDI_ASSERT(i == learning_rates->Dim()); -} - -void ScaleNnetComponents(const Vector &scale_factors, - Nnet *nnet) { - int32 i = 0; - for (int32 c = 0; c < nnet->NumComponents(); c++) { - Component *comp = nnet->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - // For now all updatable components inherit from class UpdatableComponent. - // If that changes in future, we will change this code. - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - "UpdatableComponent; change this code."; - KALDI_ASSERT(i < scale_factors.Dim()); - uc->Scale(scale_factors(i++)); + UpdatableComponent *u_comp = dynamic_cast(comp); + KALDI_ASSERT(u_comp != NULL); + u_comp->SetAsGradient(); } } - KALDI_ASSERT(i == scale_factors.Dim()); } void ScaleNnet(BaseFloat scale, Nnet *nnet) { if (scale == 1.0) return; - else if (scale == 0.0) { - SetZero(false, nnet); - } else { + else { for (int32 c = 0; c < nnet->NumComponents(); c++) { Component *comp = nnet->GetComponent(c); comp->Scale(scale); diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 75c75842817..2bcb0fdb0f6 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -125,6 +125,10 @@ void SetLearningRate(BaseFloat learning_rate, /// Scales the nnet parameters and stats by this scale. void ScaleNnet(BaseFloat scale, Nnet *nnet); +/// Sets nnet as gradient by Setting is_gradient_ to true and +/// learning_rate_ to 1 for each UpdatableComponent in nnet +void SetNnetAsGradient(Nnet *nnet); + /// Does *dest += alpha * src (affects nnet parameters and /// stored stats). void AddNnet(const Nnet &src, BaseFloat alpha, Nnet *dest); From f84d4830040ff212c7ee3e4a2b04ebb8a967c886 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 17 Jan 2017 13:41:24 -0500 Subject: [PATCH 061/213] [src]: Minor updates to sequence training and adjusting priors. (#1345) --- egs/wsj/s5/steps/nnet3/adjust_priors.sh | 17 ++-- .../s5/steps/nnet3/train_discriminative.sh | 77 ++++++++----------- 2 files changed, 37 insertions(+), 57 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/adjust_priors.sh b/egs/wsj/s5/steps/nnet3/adjust_priors.sh index 60d377f18e8..5a0d8454781 100755 --- a/egs/wsj/s5/steps/nnet3/adjust_priors.sh +++ b/egs/wsj/s5/steps/nnet3/adjust_priors.sh @@ -21,6 +21,7 @@ egs_type=egs # Compute from $egs_type.*.ark in $egs_dir use_raw_nnet=false # If raw nnet, the averaged posterior is computed # and stored in post.$iter.vec; but there is no # adjusting of priors +minibatch_size=256 iter=final . 
utils/parse_options.sh @@ -59,20 +60,16 @@ fi rm -f $dir/post.$iter.*.vec 2>/dev/null -left_context=`cat $egs_dir/info/left_context` || exit 1 -right_context=`cat $egs_dir/info/right_context` || exit 1 - -context_opts="--left-context=$left_context --right-context=$right_context" - num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } -if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; -else egs_part=JOB; fi +if [ $num_jobs_compute_prior -gt $num_archives ]; then + num_jobs_compute_prior=$num_archives +fi if [ $egs_type != "degs" ]; then $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ nnet3-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ - nnet3-merge-egs ark:- ark:- \| \ + nnet3-merge-egs --minibatch-size=$minibatch_size ark:- ark:- \| \ nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ "$model" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; @@ -80,7 +77,7 @@ else $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ nnet3-discriminative-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ nnet3-discriminative-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ - nnet3-discriminative-merge-egs ark:- ark:- \| \ + nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- \| \ nnet3-compute-from-degs $prior_gpu_opt --apply-exp=true \ "$model" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; @@ -94,7 +91,7 @@ $cmd $dir/log/vector_sum.$iter.log \ if ! $use_raw_nnet; then run.pl $dir/log/adjust_priors.$iter.log \ - nnet3-am-adjust-priors $dir/$iter.mdl $dir/post.$iter.vec $dir/$iter.adj.mdl + nnet3-am-adjust-priors $dir/$iter.mdl $dir/post.$iter.vec $dir/${iter}_adj.mdl fi rm -f $dir/post.$iter.*.vec; diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh index 139e9ba7505..05203ff5166 100755 --- a/egs/wsj/s5/steps/nnet3/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -47,16 +47,15 @@ shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of stage=-3 -adjust_priors=true # If true then it will - num_threads=16 # this is the default but you may want to change it, e.g. to 1 if # using GPUs. cleanup=true -keep_model_iters=1 +keep_model_iters=100 remove_egs=false src_model= # will default to $degs_dir/final.mdl +num_jobs_compute_prior=10 min_deriv_time=0 max_deriv_time_relative=0 @@ -129,11 +128,6 @@ done silphonelist=`cat $degs_dir/info/silence.csl` || exit 1; -num_archives_priors=0 -if $adjust_priors; then - num_archives_priors=`cat $degs_dir/info/num_archives_priors` || exit 1 -fi - num_archives=$(cat $degs_dir/info/num_archives) || exit 1; frame_subsampling_factor=$(cat $degs_dir/info/frame_subsampling_factor) @@ -200,6 +194,8 @@ if [ $stage -le -1 ]; then $cmd $dir/log/convert.log \ nnet3-am-copy --learning-rate=$learning_rate "$src_model" $dir/0.mdl || exit 1; + + ln -sf 0.mdl $dir/epoch0.mdl fi @@ -307,28 +303,11 @@ while [ $x -lt $num_iters ]; do nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; rm $nnets_list - - if [ ! -z "${iter_to_epoch[$x]}" ]; then - e=${iter_to_epoch[$x]} - ln -sf $x.mdl $dir/epoch$e.mdl - fi - - if $adjust_priors && [ ! -z "${iter_to_epoch[$x]}" ]; then - if [ ! 
-f $degs_dir/priors_egs.1.ark ]; then - echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true." - echo "$0: Run this script with --adjust-priors false to not adjust priors" - exit 1 - fi - ( - e=${iter_to_epoch[$x]} - rm $dir/.error 2> /dev/null - - steps/nnet3/adjust_priors.sh --egs-type priors_egs \ - --num-jobs-compute-prior $num_archives_priors \ - --cmd "$cmd" --use-gpu false \ - --use-raw-nnet false --iter epoch$e $dir $degs_dir \ - || { touch $dir/.error; echo "Error in adjusting priors. See $dir/log/adjust_priors.epoch$e.log"; exit 1; } - ) & + [ ! -f $dir/$[$x+1].mdl ] && echo "$0: Did not create $dir/$[$x+1].mdl" && exit 1; + if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ + [ $[($x-1)%$keep_model_iters] -ne 0 ] && \ + [ -z "${iter_to_epoch[$[$x-1]]}" ]; then + rm $dir/$[$x-1].mdl fi [ -f $dir/.error ] && { echo "Found $dir/.error. Error on iteration $x"; exit 1; } @@ -337,28 +316,27 @@ while [ $x -lt $num_iters ]; do rm $dir/cache.$x 2>/dev/null || true x=$[$x+1] num_archives_processed=$[num_archives_processed+num_jobs_nnet] -done -rm $dir/final.mdl 2>/dev/null -cp $dir/$x.mdl $dir/final.mdl -ln -sf final.mdl $dir/epoch$num_epochs_expanded.mdl + if [ $stage -le $x ] && [ ! -z "${iter_to_epoch[$x]}" ]; then + e=${iter_to_epoch[$x]} + ln -sf $x.mdl $dir/epoch$e.mdl -if $adjust_priors && [ $stage -le $num_iters ]; then - if [ ! -f $degs_dir/priors_egs.1.ark ]; then - echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true." - echo "$0: Run this script with --adjust-priors false to not adjust priors" - exit 1 - fi + ( + rm $dir/.error 2> /dev/null - steps/nnet3/adjust_priors.sh --egs-type priors_egs \ - --num-jobs-compute-prior $num_archives_priors \ - --cmd "$cmd $prior_queue_opt" --use-gpu false \ - --use-raw-nnet false --iter epoch$num_epochs_expanded \ - $dir $degs_dir || exit 1 -fi + steps/nnet3/adjust_priors.sh --egs-type degs \ + --num-jobs-compute-prior $num_jobs_compute_prior \ + --cmd "$cmd" --use-gpu false \ + --minibatch-size $minibatch_size \ + --use-raw-nnet false --iter epoch$e $dir $degs_dir \ + || { touch $dir/.error; echo "Error in adjusting priors. See $dir/log/adjust_priors.epoch$e.log"; exit 1; } + ) & + fi -echo Done +done +rm $dir/final.mdl 2>/dev/null +cp $dir/$x.mdl $dir/final.mdl # function to remove egs that might be soft links. 
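# (e.g. when degs.N.ark is a soft link into one of the utils/create_split_dir.pl
#  "storage" directories, this first removes the real file the link points to
#  and then removes the link itself; plain files are simply removed.)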
remove () { for x in $*; do [ -L $x ] && rm $(readlink -f $x); rm $x; done } @@ -379,3 +357,8 @@ if $cleanup; then fi done fi + +wait +[ -f $dir/.error ] && { echo "Found $dir/.error."; exit 1; } + +echo Done && exit 0 From e46cbac78460febaf06f756e5913210e0537b917 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 18 Jan 2017 16:17:48 -0500 Subject: [PATCH 062/213] Miscellaneous minor bug-fixes --- .../s5/local/nnet3/run_tdnn_discriminative.sh | 2 +- egs/fisher_swbd/s5/local/rt03_data_prep.sh | 20 ++-- .../s5/local/nnet3/run_tdnn_discriminative.sh | 2 +- egs/multi_en/s5/local/rt03_data_prep.sh | 18 +-- egs/swbd/README.txt | 6 +- .../local/nnet3/run_blstm_discriminative.sh | 2 +- .../s5c/local/nnet3/run_ivector_common.sh | 7 +- .../local/nnet3/run_tdnn_discriminative.sh | 2 +- egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh | 0 egs/swbd/s5c/local/rt03_data_prep.sh | 20 ++-- egs/swbd/s5c/run.sh | 7 +- .../s5_r2/local/chain/tuning/run_tdnn_1c.sh | 15 ++- egs/tedlium/s5_r2/local/nnet3/compare_wer.sh | 18 ++- egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh | 108 ------------------ .../nnet3/tuning/run_tdnn_lstm_1b_disc.sh | 5 +- .../nnet3/train/chain_objf/acoustic_model.py | 56 ++++----- egs/wsj/s5/steps/nnet3/adjust_priors.sh | 34 +++--- egs/wsj/s5/steps/nnet3/chain/train.py | 1 - egs/wsj/s5/steps/nnet3/get_degs.sh | 2 + .../s5/steps/nnet3/train_discriminative.sh | 2 +- egs/wsj/s5/utils/filter_scps.pl | 3 +- src/nnet3/discriminative-training.cc | 5 +- 22 files changed, 129 insertions(+), 206 deletions(-) mode change 100755 => 100644 egs/swbd/s5c/local/nnet3/run_ivector_common.sh mode change 100644 => 100755 egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh delete mode 100755 egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh diff --git a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh index dfaf8f90da3..7dc82ad34d1 100644 --- a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh @@ -154,7 +154,7 @@ if [ $stage -le 5 ]; then for decode_set in eval2000 rt03; do ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` - iter=epoch$x.adj + iter=epoch${x}_adj steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ diff --git a/egs/fisher_swbd/s5/local/rt03_data_prep.sh b/egs/fisher_swbd/s5/local/rt03_data_prep.sh index a18637a6a16..d565b2b4b1a 100755 --- a/egs/fisher_swbd/s5/local/rt03_data_prep.sh +++ b/egs/fisher_swbd/s5/local/rt03_data_prep.sh @@ -1,6 +1,6 @@ #!/bin/bash -# RT-03 data preparation (conversational telephone speech part only) +# RT-03 data preparation (conversational telephone speech part only) # Adapted from Arnab Ghoshal's script for Hub-5 Eval 2000 by Peng Qi # To be run from one directory above this script. @@ -8,7 +8,8 @@ # Expects the standard directory layout for RT-03 if [ $# -ne 1 ]; then - echo "Usage: "`basename $0`" " + echo "Usage: $0 " + echo "e.g.: $0 /export/corpora/LDC/LDC2007S10" echo "See comments in the script for more details" exit 1 fi @@ -19,7 +20,7 @@ sdir=$1 [ ! -d $sdir/data/references/eval03/english/cts ] \ && echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1; -. path.sh +. 
path.sh dir=data/local/rt03 mkdir -p $dir @@ -37,7 +38,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -47,7 +48,7 @@ awk -v sph2pipe=$sph2pipe '{ # sw02001-A_000098-001156 sw02001-A 0.98 11.56 #pem=$sdir/english/hub5e_00.pem #[ ! -f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 #grep -v ';;' $pem \ @@ -59,7 +60,7 @@ cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ | sort -u > $dir/segments # stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ @@ -77,7 +78,7 @@ cat $tdir/*.stm | \ grep -v inter_segment_gap | \ awk '{ printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\ - > $dir/stm + > $dir/stm #$tdir/reference/hub5e00.english.000405.stm > $dir/stm cp $rtroot/data/trans_rules/en20030506.glm $dir/glm @@ -87,10 +88,10 @@ cp $rtroot/data/trans_rules/en20030506.glm $dir/glm echo "Segments from pem file and stm file do not match." && exit 1; grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text - + # create an utt2spk file that assumes each conversation side is # a separate speaker. -awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp @@ -110,4 +111,3 @@ done echo Data preparation and formatting completed for RT-03 echo "(but not MFCC extraction)" - diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh index cf26cac406a..365d01cc85d 100755 --- a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh @@ -161,7 +161,7 @@ if [ $stage -le 5 ]; then for decode_set in test_clean test_other dev_clean dev_other; do ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` - iter=epoch$x.adj + iter=epoch${x}_adj steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ diff --git a/egs/multi_en/s5/local/rt03_data_prep.sh b/egs/multi_en/s5/local/rt03_data_prep.sh index 84955f0ed50..aa1e2ba4cc2 100755 --- a/egs/multi_en/s5/local/rt03_data_prep.sh +++ b/egs/multi_en/s5/local/rt03_data_prep.sh @@ -8,7 +8,7 @@ # - Modified paths to match multi_en naming conventions ########################################################################################### -# RT-03 data preparation (conversational telephone speech part only) +# RT-03 data preparation (conversational telephone speech part only) # Adapted from Arnab Ghoshal's script for Hub-5 Eval 2000 by Peng Qi # To be run from one directory above this script. 
@@ -16,7 +16,8 @@ # Expects the standard directory layout for RT-03 if [ $# -ne 1 ]; then - echo "Usage: "`basename $0`" " + echo "Usage: $0 " + echo "e.g.: $0 /export/corpora/LDC/LDC2007S10" echo "See comments in the script for more details" exit 1 fi @@ -45,7 +46,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -55,7 +56,7 @@ awk -v sph2pipe=$sph2pipe '{ # sw02001-A_000098-001156 sw02001-A 0.98 11.56 #pem=$sdir/english/hub5e_00.pem #[ ! -f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 #grep -v ';;' $pem \ @@ -67,7 +68,7 @@ cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ | sort -u > $dir/segments # stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ @@ -85,7 +86,7 @@ cat $tdir/*.stm | \ grep -v inter_segment_gap | \ awk '{ printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\ - > $dir/stm + > $dir/stm #$tdir/reference/hub5e00.english.000405.stm > $dir/stm cp $rtroot/data/trans_rules/en20030506.glm $dir/glm @@ -95,10 +96,10 @@ cp $rtroot/data/trans_rules/en20030506.glm $dir/glm echo "Segments from pem file and stm file do not match." && exit 1; grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text - + # create an utt2spk file that assumes each conversation side is # a separate speaker. -awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp @@ -118,4 +119,3 @@ done echo Data preparation and formatting completed for RT-03 echo "(but not MFCC extraction)" - diff --git a/egs/swbd/README.txt b/egs/swbd/README.txt index fc61a4c3060..1da570274e4 100644 --- a/egs/swbd/README.txt +++ b/egs/swbd/README.txt @@ -10,11 +10,14 @@ About the Switchboard corpus We are using the eval2000 a.k.a. hub5'00 evaluation data. The acoustics are LDC2002S09 and the text is LDC2002T43. + We are also using the RT'03 test set, available as LDC2007S10. Note: not + all parts of the recipe test with this. + About the Fisher corpus for language modeling We use Fisher English training speech transcripts for language modeling, if they are available. The catalog number for part 1 transcripts is LDC2004T19, - and LDC2005T19 for part 2. + and LDC2005T19 for part 2. Each subdirectory of this directory contains the scripts for a sequence of experiments. @@ -24,4 +27,3 @@ scripts for a sequence of experiments. s5b: This is (somewhat less) out of date, please see s5c s5c: This is the current recipe. 
- diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh index fbf6d64aefa..349fd246022 100755 --- a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -166,7 +166,7 @@ if [ $stage -le 5 ]; then for decode_set in train_dev eval2000; do ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` - iter=epoch$x.adj + iter=epoch${x}_adj steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} $context_opts \ diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh old mode 100755 new mode 100644 index 109396ed72e..894de5e58f9 --- a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh +++ b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh @@ -13,6 +13,9 @@ speed_perturb=true mkdir -p nnet3 # perturbed data preparation train_set=train_nodup + +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + if [ "$speed_perturb" == "true" ]; then if [ $stage -le 1 ]; then #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment @@ -81,7 +84,7 @@ for line in sys.stdin.readlines(): utils/fix_data_dir.sh data/${dataset}_hires; done - for dataset in eval2000 train_dev rt03; do + for dataset in eval2000 train_dev $maybe_rt03; do # Create MFCCs for the eval set utils/copy_data_dir.sh data/$dataset data/${dataset}_hires steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ @@ -133,7 +136,7 @@ if [ $stage -le 8 ]; then steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1; - for data_set in eval2000 train_dev rt03; do + for data_set in eval2000 train_dev $maybe_rt03; do steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1; done diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh index 255f1d49882..ceef60d0656 100755 --- a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh @@ -154,7 +154,7 @@ if [ $stage -le 5 ]; then for decode_set in train_dev eval2000; do ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` - iter=epoch$x.adj + iter=epoch${x}_adj steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh old mode 100644 new mode 100755 diff --git a/egs/swbd/s5c/local/rt03_data_prep.sh b/egs/swbd/s5c/local/rt03_data_prep.sh index a18637a6a16..d565b2b4b1a 100755 --- a/egs/swbd/s5c/local/rt03_data_prep.sh +++ b/egs/swbd/s5c/local/rt03_data_prep.sh @@ -1,6 +1,6 @@ #!/bin/bash -# RT-03 data preparation (conversational telephone speech part only) +# RT-03 data preparation (conversational telephone speech part only) # Adapted from Arnab Ghoshal's script for Hub-5 Eval 2000 by Peng Qi # To be run from one directory above this script. 
@@ -8,7 +8,8 @@ # Expects the standard directory layout for RT-03 if [ $# -ne 1 ]; then - echo "Usage: "`basename $0`" " + echo "Usage: $0 " + echo "e.g.: $0 /export/corpora/LDC/LDC2007S10" echo "See comments in the script for more details" exit 1 fi @@ -19,7 +20,7 @@ sdir=$1 [ ! -d $sdir/data/references/eval03/english/cts ] \ && echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1; -. path.sh +. path.sh dir=data/local/rt03 mkdir -p $dir @@ -37,7 +38,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -47,7 +48,7 @@ awk -v sph2pipe=$sph2pipe '{ # sw02001-A_000098-001156 sw02001-A 0.98 11.56 #pem=$sdir/english/hub5e_00.pem #[ ! -f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 #grep -v ';;' $pem \ @@ -59,7 +60,7 @@ cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ | sort -u > $dir/segments # stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ @@ -77,7 +78,7 @@ cat $tdir/*.stm | \ grep -v inter_segment_gap | \ awk '{ printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\ - > $dir/stm + > $dir/stm #$tdir/reference/hub5e00.english.000405.stm > $dir/stm cp $rtroot/data/trans_rules/en20030506.glm $dir/glm @@ -87,10 +88,10 @@ cp $rtroot/data/trans_rules/en20030506.glm $dir/glm echo "Segments from pem file and stm file do not match." && exit 1; grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text - + # create an utt2spk file that assumes each conversation side is # a separate speaker. -awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp @@ -110,4 +111,3 @@ done echo Data preparation and formatting completed for RT-03 echo "(but not MFCC extraction)" - diff --git a/egs/swbd/s5c/run.sh b/egs/swbd/s5c/run.sh index 0eafe73d046..8b08419007d 100755 --- a/egs/swbd/s5c/run.sh +++ b/egs/swbd/s5c/run.sh @@ -72,11 +72,16 @@ fi # local/eval2000_data_prep.sh /home/dpovey/data/LDC2002S09/hub5e_00 /home/dpovey/data/LDC2002T43 local/eval2000_data_prep.sh /export/corpora2/LDC/LDC2002S09/hub5e_00 /export/corpora2/LDC/LDC2002T43 +# prepare the rt03 data. Note: this isn't 100% necessary for this +# recipe, not all parts actually test using rt03. +local/rt03_data_prep.sh /export/corpora/LDC/LDC2007S10 + # Now make MFCC features. # mfccdir should be some place with a largish disk where you # want to store MFCC features. 
+if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi mfccdir=mfcc -for x in train eval2000; do +for x in train eval2000 $maybe_rt03; do steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" \ data/$x exp/make_mfcc/$x $mfccdir steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh index 111a68d9878..f7a18b4bfcf 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh @@ -2,7 +2,20 @@ # run_tdnn_1c.sh is like run_tdnn_1b.sh but changing chunk-width from 150 to # '140,110,160', and -# and --trainer.num-chunk-per-minibatch from 128 to 128,64 +# and --trainer.num-chunk-per-minibatch from 128 to 128,64. +# Not better; if anything a little worse. But could possibly be noise. + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1b_sp_bi exp/chain_cleaned/tdnn1c_sp_bi +# System tdnn1b_sp_bi tdnn1c_sp_bi +# WER on dev(orig) 9.4 9.8 +# WER on dev(rescored) 8.8 9.0 +# WER on test(orig) 9.6 9.7 +# WER on test(rescored) 9.0 9.2 +# Final train prob -0.0870 -0.0942 +# Final valid prob -0.1147 -0.1108 +# Final train prob (xent) -1.4014 -1.4227 +# Final valid prob (xent) -1.5634 -1.4884 + # run_tdnn_1b.sh is like run_tdnn_1a.sh but upgrading to xconfig-based # config generation. diff --git a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh index 6aff556c142..cff39def83b 100755 --- a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh +++ b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh @@ -36,14 +36,28 @@ done echo -n "Final train prob " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_train.combined.log | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_train.combined.log | grep log-like | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo echo -n "Final valid prob " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_valid.combined.log | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_valid.combined.log | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.combined.log | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.combined.log | grep accuracy | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh deleted file mode 100755 index 91ba913c183..00000000000 --- a/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash - -# This is the standard "tdnn" system, built in nnet3; this script -# is the version that's meant to run with data-cleanup, that doesn't -# support parallel alignments. - - -# by default, with cleanup: -# local/nnet3/run_tdnn.sh - -# without cleanup: -# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & - - -set -e -o pipefail -u - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -train_set=train_cleaned -gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it - # should have alignments for the specified training data. 
-num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned -tdnn_affix= #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. - -# Options which are not passed through to run_ivector_common.sh -train_stage=-10 -splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0" -remove_egs=true -relu_dim=850 -num_epochs=3 - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat </dev/null - for dset in dev test; do - ( - steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 -fi - - -exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh index 4fd74a71647..11bb733333d 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh @@ -144,9 +144,9 @@ if [ $stage -le 4 ]; then for decode_set in dev test; do num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` for iter in epoch$x epoch${x}_adj; do - + ( steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ - --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + --online-ivector-dir exp/nnet3_cleaned/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${iter} || exit 1; steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -154,6 +154,7 @@ if [ $stage -le 4 ]; then $dir/decode_${decode_set}_${iter} \ $dir/decode_${decode_set}_${iter}_rescore || exit 1; ) & + done done done fi diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 48fc119ee96..69eb0f52e3b 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -26,15 +26,26 @@ def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): This method trains a phone LM for chain training using the alignments in "tree_dir" """ + try: + f = open(tree_dir + "/num_jobs", 'r') + num_ali_jobs = int(f.readline()) + assert num_ali_jobs > 0 + except: + raise Exception("""There was an error getting the number of alignment + jobs from {0}/num_jobs""".format(tree_dir)) + + alignments=' '.join(['{0}/ali.{1}.gz'.format(tree_dir, job) + for job in range(1, num_ali_jobs + 1)]) + common_lib.run_job( """{command} {dir}/log/make_phone_lm.log \ - chain-est-phone-lm {lm_opts} \ - "ark:gunzip -c {tree_dir}/ali.*.gz | \ - ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ - {dir}/phone_lm.fst""".format( - command=run_opts.command, dir=dir, - lm_opts=lm_opts if lm_opts is not None else '', - tree_dir=tree_dir)) + gunzip -c {alignments} \| \ + ali-to-phones {tree_dir}/final.mdl ark:- ark:- \| \ + chain-est-phone-lm {lm_opts} ark:- {dir}/phone_lm.fst""".format( + command=run_opts.command, dir=dir, + alignments=alignments, + lm_opts=lm_opts if lm_opts is not None else '', + tree_dir=tree_dir)) def create_denominator_fst(dir, tree_dir, run_opts): @@ -119,7 +130,7 @@ def 
train_new_models(dir, iter, srand, num_jobs, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, - frame_subsampling_factor, truncate_deriv_weights, + frame_subsampling_factor, cache_io_opts, run_opts): """ Called from train_one_iteration(), this method trains new models @@ -168,7 +179,6 @@ def train_new_models(dir, iter, srand, num_jobs, "{raw_model}" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs \ --left-context={lc} --right-context={rc} \ - --truncate-deriv-weights={trunc_deriv} \ --frame-shift={fr_shft} \ ark:{egs_dir}/cegs.{archive_index}.ark ark:- | \ nnet3-chain-shuffle-egs --buffer-size={buf_size} \ @@ -181,7 +191,6 @@ def train_new_models(dir, iter, srand, num_jobs, next_iter=iter + 1, job=job, deriv_time_opts=" ".join(deriv_time_opts), lc=left_context, rc=right_context, - trunc_deriv=truncate_deriv_weights, app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, @@ -220,10 +229,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, - frame_subsampling_factor, truncate_deriv_weights, - run_opts, - dropout_edit_string="", - background_process_handler=None): + frame_subsampling_factor, + run_opts, background_process_handler=None): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -237,10 +244,9 @@ def train_one_iteration(dir, iter, srand, egs_dir, if os.path.exists('{0}/srand'.format(dir)): try: saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) - except (IOError, ValueError): - logger.error("Exception while reading the random seed " - "for training") - raise + except (IOError, ValueError) as e: + raise Exception("Exception while reading the random seed " + "for training: {0}".format(e.str())) if srand != saved_srand: logger.warning("The random seed provided to this iteration " "(srand={0}) is different from the one saved last " @@ -304,17 +310,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, num_chunk_per_minibatch_str) cur_max_param_change = float(max_param_change) / math.sqrt(2) - raw_model_string = raw_model_string + dropout_edit_string - - shrink_info_str = '' - if shrinkage_value != 1.0: - shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) - - logger.info("On iteration {0}, learning rate is {1}" - "{shrink_info}.".format( - iter, learning_rate, - shrink_info=shrink_info_str)) - train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, num_archives_processed=num_archives_processed, num_archives=num_archives, @@ -332,7 +327,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, shuffle_buffer_size=shuffle_buffer_size, num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, frame_subsampling_factor=frame_subsampling_factor, - truncate_deriv_weights=truncate_deriv_weights, cache_io_opts=cache_io_opts, run_opts=run_opts) [models_to_average, best_model] = common_train_lib.get_successful_models( @@ -534,7 +528,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st models_to_combine.add(num_iters) - for iter in sorted(models_to_combine): + for iter in models_to_combine: model_file = '{0}/{1}.mdl'.format(dir, iter) if os.path.exists(model_file): raw_model_strings.append( diff --git a/egs/wsj/s5/steps/nnet3/adjust_priors.sh b/egs/wsj/s5/steps/nnet3/adjust_priors.sh index 
5a0d8454781..e8adb408590 100755 --- a/egs/wsj/s5/steps/nnet3/adjust_priors.sh +++ b/egs/wsj/s5/steps/nnet3/adjust_priors.sh @@ -2,12 +2,12 @@ . path.sh -# This script computes the DNN output averaged over a small subset of +# This script computes the DNN output averaged over a small subset of # training egs and stores it in post.$iter.vec. -# This is used for the purpose of adjusting the nnet priors. -# When --use-raw-nnet is false, then the computed priors is added into the -# nnet model; hence the term adjust priors. -# When --use-raw-nnet is true, the computed priors is not added into the +# This is used for the purpose of adjusting the nnet priors. +# When --use-raw-nnet is false, then the computed priors is added into the +# nnet model; hence the term adjust priors. +# When --use-raw-nnet is true, the computed priors is not added into the # nnet model and left in the file post.$iter.vec. cmd=run.pl @@ -16,9 +16,9 @@ num_jobs_compute_prior=10 # these are single-threaded, run on CPU. use_gpu=false # if true, we run on GPU. egs_type=egs # Compute from $egs_type.*.ark in $egs_dir # If --egs-type is degs, then the program - # nnet3-discriminative-compute-from-egs is used + # nnet3-discriminative-compute-from-egs is used # instead of nnet3-compute-from-egs. -use_raw_nnet=false # If raw nnet, the averaged posterior is computed +use_raw_nnet=false # If raw nnet, the averaged posterior is computed # and stored in post.$iter.vec; but there is no # adjusting of priors minibatch_size=256 @@ -45,43 +45,42 @@ else prior_queue_opt="" fi -for f in $egs_dir/$egs_type.1.ark $egs_dir/info/num_archives; do +for f in $egs_dir/$egs_type.1.ark $egs_dir/info/num_archives; do if [ ! -f $f ]; then - echo "$f not found" - exit 1 + echo "$f not found" + exit 1 fi done if $use_raw_nnet; then model=$dir/$iter.raw -else +else model="nnet3-am-copy --raw=true $dir/$iter.mdl - |" fi rm -f $dir/post.$iter.*.vec 2>/dev/null num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } -if [ $num_jobs_compute_prior -gt $num_archives ]; then +if [ $num_jobs_compute_prior -gt $num_archives ]; then num_jobs_compute_prior=$num_archives fi if [ $egs_type != "degs" ]; then $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ - nnet3-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ + nnet3-copy-egs ark:$egs_dir/$egs_type.$JOB.ark ark:- \| \ nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ nnet3-merge-egs --minibatch-size=$minibatch_size ark:- ark:- \| \ nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ "$model" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; -else +else $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ - nnet3-discriminative-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ + nnet3-discriminative-copy-egs ark:$egs_dir/degs.JOB.ark ark:- \| \ nnet3-discriminative-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- \| \ - nnet3-compute-from-degs $prior_gpu_opt --apply-exp=true \ + nnet3-discriminative-compute-from-egs $prior_gpu_opt --apply-exp=true \ "$model" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; - fi sleep 3; # make sure there is time for $dir/post.$iter.*.vec to appear. @@ -95,4 +94,3 @@ if ! 
$use_raw_nnet; then fi rm -f $dir/post.$iter.*.vec; - diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 374e1036f00..90e11d0a83e 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -469,7 +469,6 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, frame_subsampling_factor=args.frame_subsampling_factor, - truncate_deriv_weights=args.truncate_deriv_weights, run_opts=run_opts, background_process_handler=background_process_handler) diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 9fbaf73d82c..65704fe9894 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -159,6 +159,8 @@ case $feat_type in *) echo "Invalid feature type $feat_type" && exit 1; esac +cp $srcdir/{splice_opts,cmvn_opts} $dir 2>/dev/null || true + if [ ! -z "$transform_dir" ]; then echo "$0: using transforms from $transform_dir" [ ! -s $transform_dir/num_jobs ] && \ diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh index 05203ff5166..eb1a616e9de 100755 --- a/egs/wsj/s5/steps/nnet3/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -329,7 +329,7 @@ while [ $x -lt $num_iters ]; do --cmd "$cmd" --use-gpu false \ --minibatch-size $minibatch_size \ --use-raw-nnet false --iter epoch$e $dir $degs_dir \ - || { touch $dir/.error; echo "Error in adjusting priors. See $dir/log/adjust_priors.epoch$e.log"; exit 1; } + || { touch $dir/.error; echo "Error in adjusting priors. See errors above."; exit 1; } ) & fi diff --git a/egs/wsj/s5/utils/filter_scps.pl b/egs/wsj/s5/utils/filter_scps.pl index 07e59d6ba80..418f8f73e1b 100755 --- a/egs/wsj/s5/utils/filter_scps.pl +++ b/egs/wsj/s5/utils/filter_scps.pl @@ -165,5 +165,6 @@ print STDERR "filter_scps.pl: warning: some input lines did not get output\n"; } if ($warn_multiply_covered && $print_warnings) { - print STDERR "filter_scps.pl: warning: some input lines were output to multiple files [OK if splitting per utt]\n"; + print STDERR "filter_scps.pl: warning: some input lines were output to multiple files [OK if splitting per utt] " . + join(" ", @ARGV) . 
"\n"; } diff --git a/src/nnet3/discriminative-training.cc b/src/nnet3/discriminative-training.cc index 438a01aafd9..4a32236c9ff 100644 --- a/src/nnet3/discriminative-training.cc +++ b/src/nnet3/discriminative-training.cc @@ -594,7 +594,7 @@ void DiscriminativeObjectiveInfo::Print(const std::string &criterion, } else if (criterion == "mpfe") { double avg_gradients = (tot_num_count + tot_den_count) / tot_t_weighted; double objf = tot_objf / tot_t_weighted; - KALDI_LOG << "Average modulus of MPFE gradients is " << avg_gradients + KALDI_LOG << "Average num+den count of MPFE stats is " << avg_gradients << " per frame, over " << tot_t_weighted << " frames"; KALDI_LOG << "MPFE objective function is " << objf @@ -602,7 +602,7 @@ void DiscriminativeObjectiveInfo::Print(const std::string &criterion, } else if (criterion == "smbr") { double avg_gradients = (tot_num_count + tot_den_count) / tot_t_weighted; double objf = tot_objf / tot_t_weighted; - KALDI_LOG << "Average modulus of SMBR gradients is " << avg_gradients + KALDI_LOG << "Average num+den count of SMBR stats is " << avg_gradients << " per frame, over " << tot_t_weighted << " frames"; KALDI_LOG << "SMBR objective function is " << objf @@ -642,4 +642,3 @@ void DiscriminativeObjectiveInfo::PrintAvgGradientForPdf(int32 pdf_id) const { } // namespace discriminative } // namespace kaldi - From 90c88a7c6cb184db2a0abcee8406aeba6a114a3b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 18 Jan 2017 19:37:20 -0500 Subject: [PATCH 063/213] Various minor fixes and script updates --- egs/swbd/s5c/local/nnet3/run_ivector_common.sh | 15 ++------------- .../s5c/local/nnet3/tuning/run_tdnn_d_disc.sh | 2 +- .../local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh | 15 +++++++++------ egs/wsj/s5/steps/nnet3/train_discriminative.sh | 6 +----- src/nnet3/nnet-chain-training.cc | 1 + 5 files changed, 14 insertions(+), 25 deletions(-) mode change 100644 => 100755 egs/swbd/s5c/local/nnet3/run_ivector_common.sh diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh old mode 100644 new mode 100755 index 894de5e58f9..9768d82c806 --- a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh +++ b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh @@ -62,18 +62,7 @@ if [ $stage -le 3 ]; then for dataset in $train_set train_100k_nodup; do utils/copy_data_dir.sh data/$dataset data/${dataset}_hires - # scale the waveforms, this is useful as we don't use CMVN - data_dir=data/${dataset}_hires - cat $data_dir/wav.scp | python -c " -import sys, os, subprocess, re, random -scale_low = 1.0/8 -scale_high = 2.0 -for line in sys.stdin.readlines(): - if len(line.strip()) == 0: - continue - print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) -"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; - mv $data_dir/wav.scp_scaled $data_dir/wav.scp + utils/data/perturb_data_dir_volume.sh adata/${dataset}_hires steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; @@ -131,7 +120,7 @@ if [ $stage -le 8 ]; then # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). 
- steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1; diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh index 715a93ea49d..da7cae954f8 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh @@ -147,7 +147,7 @@ if [ $stage -le 4 ]; then for iter in epoch$x epoch${x}_adj; do steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ - --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg_${iter} || exit 1; steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh index 11bb733333d..9a77a6af6c7 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh @@ -37,7 +37,7 @@ criterion=smbr one_silence_class=true # you can set --disc-affix if you run different configurations, e.g. --disc-affix "_b" -disc_affix= +disc_affix=slow dir=${srcdir}_${criterion}${disc_affix} @@ -56,10 +56,10 @@ extra_right_context=0 ## Nnet training options -effective_learning_rate=0.0000125 +effective_learning_rate=0.000005 max_param_change=1 num_jobs_nnet=4 -num_epochs=4 +num_epochs=2 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options, # in chain models. 
minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); @@ -146,11 +146,14 @@ if [ $stage -le 4 ]; then for iter in epoch$x epoch${x}_adj; do ( steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ - --online-ivector-dir exp/nnet3_cleaned/ivectors_${decode_set}_hires \ - $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${iter} || exit 1; + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --online-ivector-dir exp/nnet3_cleaned/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${iter} || exit 1; steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - data/lang_test data/lang_rescore data/${decode_set}_hires \ + data/lang data/lang_rescore data/${decode_set}_hires \ $dir/decode_${decode_set}_${iter} \ $dir/decode_${decode_set}_${iter}_rescore || exit 1; ) & diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh index eb1a616e9de..77198a00576 100755 --- a/egs/wsj/s5/steps/nnet3/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -258,11 +258,7 @@ while [ $x -lt $num_iters ]; do fi if $use_frame_shift; then - if [ $[num_archives % frame_subsampling_factor] -ne 0 ]; then - frame_shift=$[k % frame_subsampling_factor] - else - frame_shift=$[(k + k/num_archives) % frame_subsampling_factor] - fi + frame_shift=$[(k%num_archives + k/num_archives) % frame_subsampling_factor] else frame_shift=0 fi diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 4f63ba8304c..c3ae3ae0336 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -240,6 +240,7 @@ bool NnetChainTrainer::PrintTotalStats() const { const ObjectiveFunctionInfo &info = iter->second; ans = info.PrintTotalStats(name) || ans; } + PrintMaxChangeStats(); return ans; } From d9b27f2365b81c8fec5c486e56693a4ea099443e Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 1 Dec 2016 12:40:14 -0800 Subject: [PATCH 064/213] Update tools/Makefile to support OpenFst-1.5.4. --- tools/Makefile | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 9fdc35da402..0f5af6c7452 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -4,10 +4,11 @@ CXX = g++ # CXX = clang++ # Uncomment this line to build with Clang. CC = gcc # used for sph2pipe - OPENFST_VERSION = 1.3.4 # Uncomment the next line to build with OpenFst-1.4.1. # OPENFST_VERSION = 1.4.1 +# Uncomment the next line to build with OpenFst-1.5.4. +# OPENFST_VERSION = 1.5.4 # Note: OpenFst >= 1.4 requires C++11 support, hence you will need to use a # relatively recent C++ compiler, e.g. gcc >= 4.6, clang >= 3.0. @@ -20,11 +21,11 @@ ifeq ($(OPENFST_VERSION), 1.3.4) CXXFLAGS += -stdlib=libstdc++ LDFLAGS += -stdlib=libstdc++ endif +else ifeq ($(OPENFST_VERSION), 1.4.1) +else ifeq ($(OPENFST_VERSION), 1.5.4) else - ifneq ($(OPENFST_VERSION), 1.4.1) $(error OpenFst version $(OPENFST_VERSION) is not supported. 
\ - Supported versions: 1.3.4, 1.4.1) - endif + Supported versions: 1.3.4, 1.4.1, 1.5.4) endif all: check_required_programs sph2pipe atlas sclite openfst @@ -92,12 +93,14 @@ else cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" endif -# patches for openfst. openfst_gcc41up.patch is a patch for openfst to \ -# support multi-threads when compile with g++ (gcc) version above 4.1 +# patches for openfst. openfst_gcc41up.patch is a patch for openfst to +# support multi-threading when compiling with gcc >= 4.1. openfst-$(OPENFST_VERSION)/.patched: | openfst-$(OPENFST_VERSION) +ifneq ($(OPENFST_VERSION), 1.5.4) cd openfst-$(OPENFST_VERSION)/; \ patch -p1 -N < ../extras/openfst-$(OPENFST_VERSION).patch; $(CXX) -dumpversion | awk '{if(NR==1 && $$1>"4.1") print "cd openfst-$(OPENFST_VERSION)/src/include/fst; patch -c -p0 -N < ../../../../extras/openfst_gcc41up.patch"}' | sh - +endif touch $@ openfst-$(OPENFST_VERSION): openfst-$(OPENFST_VERSION).tar.gz From 2bc558b58c717ef826a74bfb472aa13278590e1f Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 1 Dec 2016 13:26:49 -0800 Subject: [PATCH 065/213] Update src/configure to support OpenFst-1.5.4. --- src/configure | 40 +++++++++++++++++------------------ src/makefiles/darwin_10_10.mk | 3 ++- src/makefiles/darwin_10_11.mk | 3 ++- src/makefiles/darwin_10_12.mk | 3 ++- src/makefiles/darwin_10_9.mk | 3 ++- 5 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/configure b/src/configure index bb8f5d4cb4d..c70fbc322d5 100755 --- a/src/configure +++ b/src/configure @@ -960,20 +960,6 @@ if [ ! -f makefiles/common.mk ]; then failure makefiles/common.mk not found fi - -echo "Checking OpenFST library in $FSTROOT ..." -if [ ! -f $FSTROOT/include/fst/fst.h ]; then - failure "Could not find file $FSTROOT/include/fst/fst.h: - you may not have installed OpenFst. See ../tools/INSTALL" -fi -echo Checking OpenFst library was patched. -if ! grep "multiple repeated" $FSTROOT/include/fst/minimize.h >/dev/null; then - echo "** ERROR **" - echo "** $FSTROOT/include/fst/minimize.h seems not to be patched:" - echo "patch not applied? FST tools will not work in our recipe." - exit 1; -fi - # back up the old one in case we modified it if [ -f kaldi.mk ]; then echo "Backing up kaldi.mk to kaldi.mk.bak" @@ -990,15 +976,29 @@ fi echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk echo "FSTROOT = $FSTROOT" >> kaldi.mk -# Check installed OpenFst version and add C++11 flags if OpenFst >= 1.4 +echo "Checking OpenFST library in $FSTROOT ..." +if [ ! -f $FSTROOT/include/fst/fst.h ]; then + failure "Could not find file $FSTROOT/include/fst/fst.h: + you may not have installed OpenFst. See ../tools/INSTALL" +fi + OPENFST_VER="${OPENFST_VER:-`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'`}" +echo "Adding flags necessary for compiling against OpenFst-$OPENFST_VER ..." echo "OPENFST_VER = $OPENFST_VER" >> kaldi.mk OPENFST_VER_NUM=`echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d"` +echo "EXTRA_CXXFLAGS += -DOPENFST_VER=$OPENFST_VER_NUM" >> kaldi.mk if [ $OPENFST_VER_NUM -ge 10400 ]; then - echo "OPENFST_GE_10400 = 1" >> kaldi.mk - echo "EXTRA_CXXFLAGS += -DHAVE_OPENFST_GE_10400 -std=c++0x" >> kaldi.mk -else - echo "OPENFST_GE_10400 = 0" >> kaldi.mk + echo "EXTRA_CXXFLAGS += -std=c++0x" >> kaldi.mk +fi + +if [ $OPENFST_VER_NUM -lt 10500 ]; then + echo "Checking if OpenFst library was patched ..." + if ! 
grep "multiple repeated" $FSTROOT/include/fst/minimize.h >/dev/null; then + echo "** ERROR **" + echo "** $FSTROOT/include/fst/minimize.h seems not to be patched:" + echo "patch not applied? FST tools will not work in our recipe." + exit 1; + fi fi # Most of the OS-specific steps below will append to kaldi.mk @@ -1044,7 +1044,6 @@ fi # which crashes on Darwin. Also the linear algebra libraries on Macs are # used differently (through the Accelerate framework) than on Linux. if [ "`uname`" == "Darwin" ]; then - $use_cuda && configure_cuda echo "On Darwin: checking for Accelerate framework ..." if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then failure "Need the Accelerate.framework to compile on Darwin." @@ -1090,6 +1089,7 @@ if [ "`uname`" == "Darwin" ]; then else failure "OS X version '$osx_ver' not supported" fi + $use_cuda && configure_cuda echo "Configuration succeeded for platform Darwin." exit_success; fi diff --git a/src/makefiles/darwin_10_10.mk b/src/makefiles/darwin_10_10.mk index dcb35b0c59e..77d82708b1e 100644 --- a/src/makefiles/darwin_10_10.mk +++ b/src/makefiles/darwin_10_10.mk @@ -33,7 +33,8 @@ COMPILER = $(shell $(CXX) -v 2>&1 ) ifeq ($(findstring clang,$(COMPILER)),clang) CXXFLAGS += -Wno-mismatched-tags # Link with libstdc++ if we are building against OpenFst < 1.4 - ifneq ("$(OPENFST_GE_10400)","1") + OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") + ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10400)","1") CXXFLAGS += -stdlib=libstdc++ LDFLAGS += -stdlib=libstdc++ endif diff --git a/src/makefiles/darwin_10_11.mk b/src/makefiles/darwin_10_11.mk index 73cd006735e..c3b11a49cfc 100644 --- a/src/makefiles/darwin_10_11.mk +++ b/src/makefiles/darwin_10_11.mk @@ -33,7 +33,8 @@ COMPILER = $(shell $(CXX) -v 2>&1 ) ifeq ($(findstring clang,$(COMPILER)),clang) CXXFLAGS += -Wno-mismatched-tags # Link with libstdc++ if we are building against OpenFst < 1.4 - ifneq ("$(OPENFST_GE_10400)","1") + OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") + ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10400)","1") CXXFLAGS += -stdlib=libstdc++ LDFLAGS += -stdlib=libstdc++ endif diff --git a/src/makefiles/darwin_10_12.mk b/src/makefiles/darwin_10_12.mk index 68f50f01d51..46e05cc3427 100644 --- a/src/makefiles/darwin_10_12.mk +++ b/src/makefiles/darwin_10_12.mk @@ -33,7 +33,8 @@ COMPILER = $(shell $(CXX) -v 2>&1 ) ifeq ($(findstring clang,$(COMPILER)),clang) CXXFLAGS += -Wno-mismatched-tags # Link with libstdc++ if we are building against OpenFst < 1.4 - ifneq ("$(OPENFST_GE_10400)","1") + OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") + ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10400)","1") CXXFLAGS += -stdlib=libstdc++ LDFLAGS += -stdlib=libstdc++ endif diff --git a/src/makefiles/darwin_10_9.mk b/src/makefiles/darwin_10_9.mk index 0069372c8ef..f3e8817503e 100644 --- a/src/makefiles/darwin_10_9.mk +++ b/src/makefiles/darwin_10_9.mk @@ -33,7 +33,8 @@ COMPILER = $(shell $(CXX) -v 2>&1 ) ifeq ($(findstring clang,$(COMPILER)),clang) CXXFLAGS += -Wno-mismatched-tags # Link with libstdc++ if we are building against OpenFst < 1.4 - ifneq ("$(OPENFST_GE_10400)","1") + OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") + ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10400)","1") CXXFLAGS += -stdlib=libstdc++ LDFLAGS += -stdlib=libstdc++ endif From b6b55d877748a4196fd764ad34121e452a100e1f Mon Sep 17 00:00:00 2001 From: Dogan Can 
Date: Thu, 1 Dec 2016 13:29:57 -0800 Subject: [PATCH 066/213] Fix unqualified std::vector occurrences. --- src/decoder/nbest-decoder.h | 18 +++++++------- src/kws/kws-functions.cc | 16 ++++++------- src/lat/lattice-functions.h | 9 ++++--- src/lat/sausages.cc | 30 ++++++++++++------------ src/latbin/lattice-oracle.cc | 6 ++--- src/latbin/lattice-rescore-mapped.cc | 12 +++++----- src/latbin/nbest-to-linear.cc | 12 +++++----- src/nnet2bin/nnet-am-average.cc | 7 +++--- src/nnet2bin/nnet-normalize-stddev.cc | 14 +++++------ src/nnetbin/nnet-train-mmi-sequential.cc | 2 +- src/nnetbin/nnet-train-mpe-sequential.cc | 2 +- src/online2/online-ivector-feature.cc | 4 ++-- 12 files changed, 65 insertions(+), 67 deletions(-) diff --git a/src/decoder/nbest-decoder.h b/src/decoder/nbest-decoder.h index 8db071d6591..daecc84e7b2 100644 --- a/src/decoder/nbest-decoder.h +++ b/src/decoder/nbest-decoder.h @@ -179,7 +179,7 @@ class NBestDecoder { continue; // skip that token } LatticeWeight path_w(lmscore, amscore); - CompactLatticeWeight path_weight(path_w, vector()); + CompactLatticeWeight path_weight(path_w, std::vector()); std::vector arcs_reverse; // reverse order output arcs // outer loop for word tokens @@ -230,8 +230,8 @@ class NBestDecoder { // ShortestPath(fst, &fst_one); // ConvertLattice(fst_one, fst_out, true); // return true; - // } - + // } + private: // TokenStore is a store of linked tokens with its own allocator @@ -388,7 +388,7 @@ class NBestDecoder { return tok2; } } - + inline bool CombineN(Elem *head, Token *new_tok) { // n-best version if (!new_tok) return false; Elem *e = head; @@ -435,7 +435,7 @@ class NBestDecoder { } inline Token* Advance(Token *source, Arc &arc, int32 frame, BaseFloat cutoff) { - // compute new weight + // compute new weight Weight w = Times(source->c, arc.weight); Weight amscore = Weight::One(); if (arc.ilabel > 0) { // emitting arc @@ -446,7 +446,7 @@ class NBestDecoder { if (w.Value() > cutoff) { // prune return NULL; } - // create new token + // create new token Token *tok; if (arc.olabel > 0) { // create new token // find or create corresponding Token @@ -593,10 +593,10 @@ class NBestDecoder { // KALDI_ASSERT(state == tok->arc_.nextstate); for (fst::ArcIterator > aiter(fst_, state); !aiter.Done(); aiter.Next()) { - // for all a in A(state) + // for all a in A(state) Arc arc = aiter.Value(); if (arc.ilabel != 0) { // propagate only emitting - Token *new_tok = + Token *new_tok = token_store_.Advance(tok, arc, frame, next_weight_cutoff); if (new_tok) { Elem *e_found = toks_.Find(arc.nextstate); @@ -637,7 +637,7 @@ class NBestDecoder { queue_.erase(queue_.begin()); Elem *elem = toks_.Find(state); // would segfault if state not // in toks_ but this can't happen. - + // we have to pop all tokens with the same state // this may create some unneccessary repetitions, since only the new token // needs to be forwarded, but I don't know yet how to solve this diff --git a/src/kws/kws-functions.cc b/src/kws/kws-functions.cc index 8cb82c7bb0f..d1d71ce7a42 100644 --- a/src/kws/kws-functions.cc +++ b/src/kws/kws-functions.cc @@ -38,12 +38,12 @@ bool CompareInterval(const Interval &i1, } bool ClusterLattice(CompactLattice *clat, - const vector &state_times) { + const std::vector &state_times) { using namespace fst; typedef CompactLattice::StateId StateId; // Hashmap to store the cluster heads. 
- unordered_map > head; + unordered_map > head; // Step 1: Iterate over the lattice to get the arcs StateId max_id = 0; @@ -72,11 +72,11 @@ bool ClusterLattice(CompactLattice *clat, // the cluster heads is to take the first one as a cluster head; then go // till we find the next one that doesn't overlap in time with the current // cluster head, and so on. - unordered_map >::iterator iter; + unordered_map >::iterator iter; for (iter = head.begin(); iter != head.end(); ++iter) { // For this ilabel, sort all the arcs on time, from first to last. sort(iter->second.begin(), iter->second.end(), CompareInterval); - vector tmp; + std::vector tmp; tmp.push_back(iter->second[0]); for (int32 i = 1; i < iter->second.size(); i++) { if (tmp.back().End() <= iter->second[i].Start()) @@ -158,7 +158,7 @@ class CompactLatticeToKwsProductFstMapper { bool CreateFactorTransducer(const CompactLattice &clat, - const vector &state_times, + const std::vector &state_times, int32 utterance_id, KwsProductFst *factor_transducer) { using namespace fst; @@ -166,8 +166,8 @@ bool CreateFactorTransducer(const CompactLattice &clat, // We first compute the alphas and betas bool success = false; - vector alpha; - vector beta; + std::vector alpha; + std::vector beta; success = ComputeCompactLatticeAlphas(clat, &alpha); success = success && ComputeCompactLatticeBetas(clat, &beta); if (!success) @@ -263,7 +263,7 @@ bool CreateFactorTransducer(const CompactLattice &clat, } void RemoveLongSilences(int32 max_silence_frames, - const vector &state_times, + const std::vector &state_times, KwsProductFst *factor_transducer) { using namespace fst; typedef KwsProductArc::StateId StateId; diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h index c58b2ec32b8..c95af70d7eb 100644 --- a/src/lat/lattice-functions.h +++ b/src/lat/lattice-functions.h @@ -67,12 +67,12 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, // the CompactLattice lattice format. Also we only need the alpha in the forward // path, not the posteriors. bool ComputeCompactLatticeAlphas(const CompactLattice &lat, - vector *alpha); + std::vector *alpha); // A sibling of the function CompactLatticeAlphas()... We compute the beta from // the backward path here. bool ComputeCompactLatticeBetas(const CompactLattice &lat, - vector *beta); + std::vector *beta); // Computes (normal or Viterbi) alphas and betas; returns (total-prob, or @@ -82,8 +82,8 @@ bool ComputeCompactLatticeBetas(const CompactLattice &lat, template double ComputeLatticeAlphasAndBetas(const LatticeType &lat, bool viterbi, - vector *alpha, - vector *beta); + std::vector *alpha, + std::vector *beta); /// Topologically sort the compact lattice if not already topologically sorted. @@ -321,4 +321,3 @@ void ComposeCompactLatticeDeterministic( } // namespace kaldi #endif // KALDI_LAT_LATTICE_FUNCTIONS_H_ - diff --git a/src/lat/sausages.cc b/src/lat/sausages.cc index 53678efe844..e6fd0b61dd9 100644 --- a/src/lat/sausages.cc +++ b/src/lat/sausages.cc @@ -25,7 +25,7 @@ namespace kaldi { // this is Figure 6 in the paper. void MinimumBayesRisk::MbrDecode() { - + for (size_t counter = 0; ; counter++) { NormalizeEps(&R_); AccStats(); // writes to gamma_ @@ -33,13 +33,13 @@ void MinimumBayesRisk::MbrDecode() { one_best_times_.clear(); one_best_confidences_.clear(); - + // Caution: q in the line below is (q-1) in the algorithm // in the paper; both R_ and gamma_ are indexed by q-1. for (size_t q = 0; q < R_.size(); q++) { - if (do_mbr_) { // This loop updates R_ [indexed same as gamma_]. 
+ if (do_mbr_) { // This loop updates R_ [indexed same as gamma_]. // gamma_[i] is sorted in reverse order so most likely one is first. - const vector > &this_gamma = gamma_[q]; + const std::vector > &this_gamma = gamma_[q]; double old_gamma = 0, new_gamma = this_gamma[0].second; int32 rq = R_[q], rhat = this_gamma[0].first; // rq: old word, rhat: new. for (size_t j = 0; j < this_gamma.size(); j++) @@ -71,7 +71,7 @@ void MinimumBayesRisk::MbrDecode() { struct Int32IsZero { bool operator() (int32 i) { return (i == 0); } }; -// static +// static void MinimumBayesRisk::RemoveEps(std::vector *vec) { Int32IsZero pred; vec->erase(std::remove_if (vec->begin(), vec->end(), pred), @@ -96,7 +96,7 @@ double MinimumBayesRisk::EditDistance(int32 N, int32 Q, Vector &alpha_dash_arc) { alpha(1) = 0.0; // = log(1). Line 5. alpha_dash(1, 0) = 0.0; // Line 5. - for (int32 q = 1; q <= Q; q++) + for (int32 q = 1; q <= Q; q++) alpha_dash(1, q) = alpha_dash(1, q-1) + l(0, r(q)); // Line 7. for (int32 n = 2; n <= N; n++) { double alpha_n = kLogZeroDouble; @@ -132,7 +132,7 @@ double MinimumBayesRisk::EditDistance(int32 N, int32 Q, // Figure 5 in the paper. void MinimumBayesRisk::AccStats() { using std::map; - + int32 N = static_cast(pre_.size()) - 1, Q = static_cast(R_.size()); @@ -141,8 +141,8 @@ void MinimumBayesRisk::AccStats() { Vector alpha_dash_arc(Q+1); // index 0...Q Matrix beta_dash(N+1, Q+1); // index (1...N, 0...Q) Vector beta_dash_arc(Q+1); // index 0...Q - vector b_arc(Q+1); // integer in {1,2,3}; index 1...Q - vector > gamma(Q+1); // temp. form of gamma. + std::vector b_arc(Q+1); // integer in {1,2,3}; index 1...Q + std::vector > gamma(Q+1); // temp. form of gamma. // index 1...Q [word] -> occ. // The tau arrays below are the sums over words of the tau_b @@ -151,7 +151,7 @@ void MinimumBayesRisk::AccStats() { // the sausage bins, not specifically for the 1-best output. Vector tau_b(Q+1), tau_e(Q+1); - double Ltmp = EditDistance(N, Q, alpha, alpha_dash, alpha_dash_arc); + double Ltmp = EditDistance(N, Q, alpha, alpha_dash, alpha_dash_arc); if (L_ != 0 && Ltmp > L_) { // L_ != 0 is to rule out 1st iter. KALDI_WARN << "Edit distance increased: " << Ltmp << " > " << L_; @@ -262,7 +262,7 @@ void MinimumBayesRisk::AccStats() { double avg = 0.5 * (times_[q-2].second + times_[q-1].first); times_[q-2].second = times_[q-1].first = avg; } - } + } } void MinimumBayesRisk::PrepareLatticeAndInitStats(CompactLattice *clat) { @@ -271,7 +271,7 @@ void MinimumBayesRisk::PrepareLatticeAndInitStats(CompactLattice *clat) { CreateSuperFinal(clat); // Add super-final state to clat... this is // one of the requirements of the MBR algorithm, as mentioned in the // paper (i.e. just one final state). - + // Topologically sort the lattice, if not already sorted. kaldi::uint64 props = clat->Properties(fst::kFstProperties, false); if (!(props & fst::kTopSorted)) { @@ -283,7 +283,7 @@ void MinimumBayesRisk::PrepareLatticeAndInitStats(CompactLattice *clat) { state_times_.push_back(0); // we'll convert to 1-based numbering. for (size_t i = state_times_.size()-1; i > 0; i--) state_times_[i] = state_times_[i-1]; - + // Now we convert the information in "clat" into a special internal // format (pre_, post_ and arcs_) which allows us to access the // arcs preceding any given state. @@ -343,9 +343,9 @@ MinimumBayesRisk::MinimumBayesRisk(const CompactLattice &clat_in, bool do_mbr): L_ = 0.0; // Set current edit-distance to 0 [just so we know // when we're on the 1st iter.] 
} - + MbrDecode(); - + } MinimumBayesRisk::MinimumBayesRisk(const CompactLattice &clat_in, diff --git a/src/latbin/lattice-oracle.cc b/src/latbin/lattice-oracle.cc index 799a7f6ce67..80c4e3e05d4 100644 --- a/src/latbin/lattice-oracle.cc +++ b/src/latbin/lattice-oracle.cc @@ -67,7 +67,7 @@ void MapWildCards(const LabelSet &wildcards, fst::StdVectorFst *ofst) { LabelSet::const_iterator it = wildcards.find(arc.ilabel); if (it != wildcards.end()) { KALDI_VLOG(4) << "MapWildCards: mapping symbol " << arc.ilabel - << " to epsilon" << endl; + << " to epsilon" << std::endl; arc.ilabel = 0; } it = wildcards.find(arc.olabel); @@ -173,7 +173,7 @@ void CountErrors(const fst::StdVectorFst &fst, bool CheckFst(const fst::StdVectorFst &fst, string name, string key) { #ifdef DEBUG StateId numstates = fst.NumStates(); - cerr << " " << name << " has " < lattice_fst; diff --git a/src/latbin/lattice-rescore-mapped.cc b/src/latbin/lattice-rescore-mapped.cc index 4dd8dfd875c..9dcc63219ee 100644 --- a/src/latbin/lattice-rescore-mapped.cc +++ b/src/latbin/lattice-rescore-mapped.cc @@ -1,7 +1,7 @@ // latbin/lattice-rescore-mapped.cc // Copyright 2009-2012 Saarland University (author: Arnab Ghoshal) -// Johns Hopkins University (author: Daniel Povey) +// Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -91,7 +91,7 @@ int main(int argc, char *argv[]) { "Usage: lattice-rescore-mapped [options] " " \n" " e.g.: nnet-logprob [args] .. | lattice-rescore-mapped final.mdl ark:1.lats ark:- ark:2.lats\n"; - + kaldi::BaseFloat old_acoustic_scale = 0.0; kaldi::ParseOptions po(usage); po.Register("old-acoustic-scale", &old_acoustic_scale, @@ -116,12 +116,12 @@ int main(int argc, char *argv[]) { trans_model.Read(ki.Stream(), binary); // Ignore what follows it in the model. } - + RandomAccessBaseFloatMatrixReader loglike_reader(loglike_rspecifier); // Read as regular lattice SequentialLatticeReader lattice_reader(lats_rspecifier); // Write as compact lattice. 
- CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + CompactLatticeWriter compact_lattice_writer(lats_wspecifier); int32 num_done = 0, num_err = 0; int64 num_frames = 0; @@ -144,7 +144,7 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Cycles detected in lattice."; } - vector state_times; + std::vector state_times; int32 max_time = kaldi::LatticeStateTimes(lat, &state_times); const Matrix &log_likes = loglike_reader.Value(key); if (log_likes.NumRows() != max_time) { @@ -154,7 +154,7 @@ int main(int argc, char *argv[]) { num_err++; continue; } - + kaldi::LatticeAcousticRescore(trans_model, log_likes, state_times, &lat); CompactLattice clat_out; diff --git a/src/latbin/nbest-to-linear.cc b/src/latbin/nbest-to-linear.cc index 79da978e086..6b3fe5e1d01 100644 --- a/src/latbin/nbest-to-linear.cc +++ b/src/latbin/nbest-to-linear.cc @@ -40,7 +40,7 @@ int main(int argc, char *argv[]) { "[ [ []]]\n" " e.g.: lattice-to-nbest --n=10 ark:1.lats ark:- | \\\n" " nbest-to-linear ark:1.lats ark,t:1.ali ark,t:1.tra\n"; - + ParseOptions po(usage); po.Read(argc, argv); @@ -62,17 +62,17 @@ int main(int argc, char *argv[]) { Int32VectorWriter trans_writer(trans_wspecifier); BaseFloatWriter lm_cost_writer(lm_cost_wspecifier); BaseFloatWriter ac_cost_writer(ac_cost_wspecifier); - + int32 n_done = 0, n_err = 0; - + for (; !lattice_reader.Done(); lattice_reader.Next()) { std::string key = lattice_reader.Key(); Lattice lat = lattice_reader.Value(); - vector ilabels; - vector olabels; + std::vector ilabels; + std::vector olabels; LatticeWeight weight; - + if (!GetLinearSymbolSequence(lat, &ilabels, &olabels, &weight)) { KALDI_WARN << "Lattice/nbest for key " << key << " had wrong format: " "note, this program expects input with one path, e.g. from " diff --git a/src/nnet2bin/nnet-am-average.cc b/src/nnet2bin/nnet-am-average.cc index 0fa00f05995..d35375f44f2 100644 --- a/src/nnet2bin/nnet-am-average.cc +++ b/src/nnet2bin/nnet-am-average.cc @@ -29,7 +29,7 @@ namespace kaldi { void GetWeights(const std::string &weights_str, int32 num_inputs, - vector *weights) { + std::vector *weights) { KALDI_ASSERT(num_inputs >= 1); if (!weights_str.empty()) { SplitStringToFloats(weights_str, ":", true, weights); @@ -169,7 +169,7 @@ int main(int argc, char *argv[]) { int32 num_inputs = po.NumArgs() - 1; - vector model_weights; + std::vector model_weights; GetWeights(weights_str, num_inputs, &model_weights); int32 c_begin = 0, @@ -179,7 +179,7 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(c_end != -1 && "Network has no updatable components."); int32 last_layer_idx = am_nnet1.GetNnet().NumComponents(); - vector skip_layers = GetSkipLayers(skip_layers_str, + std::vector skip_layers = GetSkipLayers(skip_layers_str, 0, last_layer_idx); @@ -257,4 +257,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/nnet2bin/nnet-normalize-stddev.cc b/src/nnet2bin/nnet-normalize-stddev.cc index 29e3cf8fb80..b23faef5fc1 100644 --- a/src/nnet2bin/nnet-normalize-stddev.cc +++ b/src/nnet2bin/nnet-normalize-stddev.cc @@ -47,13 +47,13 @@ int main(int argc, char *argv[]) { bool binary_write = true; BaseFloat stddev = 1.0; std::string reference_model_filename; - + ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); po.Register("stddev-from", &reference_model_filename, "Reference model"); po.Register("stddev", &stddev, "Target standard deviation that we normalize " "to (note: is overridden by --stddev-from option, if supplied)"); - + po.Read(argc, argv); if (po.NumArgs() != 2) { 
@@ -77,7 +77,7 @@ int main(int argc, char *argv[]) { // Works out the layers that we would like to normalize: any affine or block // affine layers that are followed by pnorm and then renormalize layers. - vector identified_components; + std::vector identified_components; for (int32 c = 0; c < am_nnet.GetNnet().NumComponents() - 2; c++) { // Checks if the current layer is an affine layer or block affine layer. // Also includes PreconditionedAffineComponent and @@ -89,13 +89,13 @@ int main(int argc, char *argv[]) { dynamic_cast(component); if (ac == NULL && bac == NULL) continue; - + // Checks if the next layer is a pnorm layer. component = &(am_nnet.GetNnet().GetComponent(c + 1)); PnormComponent *pc = dynamic_cast(component); if (pc == NULL) continue; - + // Checks if the layer after the pnorm layer is a NormalizeComponent // or a PowerComponent followed by a NormalizeComponent component = &(am_nnet.GetNnet().GetComponent(c + 2)); @@ -126,7 +126,7 @@ int main(int argc, char *argv[]) { } BaseFloat ref_stddev = 0.0; - + // Normalizes the identified layers. for (int32 c = 0; c < identified_components.size(); c++) { ref_stddev = stddev; @@ -150,7 +150,7 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(uc != NULL); Vector params(uc->GetParameterDim()); uc->Vectorize(¶ms); - BaseFloat params_average = params.Sum() + BaseFloat params_average = params.Sum() / static_cast(params.Dim()); params.Add(-1.0 * params_average); BaseFloat params_stddev = sqrt(VecVec(params, params) diff --git a/src/nnetbin/nnet-train-mmi-sequential.cc b/src/nnetbin/nnet-train-mmi-sequential.cc index 02a94ff3979..2554d64287a 100644 --- a/src/nnetbin/nnet-train-mmi-sequential.cc +++ b/src/nnetbin/nnet-train-mmi-sequential.cc @@ -272,7 +272,7 @@ int main(int argc, char *argv[]) { } } // get the lattice length and times of states, - vector state_times; + std::vector state_times; int32 max_time = kaldi::LatticeStateTimes(den_lat, &state_times); // check duration of den. lattice, if (max_time != mat.NumRows()) { diff --git a/src/nnetbin/nnet-train-mpe-sequential.cc b/src/nnetbin/nnet-train-mpe-sequential.cc index 76b4110ca28..2ba14527142 100644 --- a/src/nnetbin/nnet-train-mpe-sequential.cc +++ b/src/nnetbin/nnet-train-mpe-sequential.cc @@ -276,7 +276,7 @@ int main(int argc, char *argv[]) { } } // get the lattice length and times of states - vector state_times; + std::vector state_times; int32 max_time = kaldi::LatticeStateTimes(den_lat, &state_times); // check for temporal length of denominator lattices if (max_time != mat.NumRows()) { diff --git a/src/online2/online-ivector-feature.cc b/src/online2/online-ivector-feature.cc index fcdab88408e..cdfc5948571 100644 --- a/src/online2/online-ivector-feature.cc +++ b/src/online2/online-ivector-feature.cc @@ -387,7 +387,7 @@ OnlineSilenceWeighting::OnlineSilenceWeighting( const OnlineSilenceWeightingConfig &config): trans_model_(trans_model), config_(config), num_frames_output_and_correct_(0) { - vector silence_phones; + std::vector silence_phones; SplitStringToIntegers(config.silence_phones_str, ":,", false, &silence_phones); for (size_t i = 0; i < silence_phones.size(); i++) @@ -514,7 +514,7 @@ void OnlineSilenceWeighting::GetDeltaWeights( frames_out = static_cast(frame_info_.size()) - begin_frame; // frames_out is the number of frames we will output. 
KALDI_ASSERT(frames_out >= 0); - vector frame_weight(frames_out, 1.0); + std::vector frame_weight(frames_out, 1.0); // we will frame_weight to the value silence_weight for silence frames and for // transition-ids that repeat with duration > max_state_duration. Frames newer // than the most recent traceback will get a weight equal to the weight for the From 7001487371bde987fb6a70d98dba8c3e3abb75c2 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 1 Dec 2016 14:11:26 -0800 Subject: [PATCH 067/213] Update fstext to support OpenFst-1.5.4. OpenFst-1.5 replaces internal custom reference counting (RefCounter) with C++11 smart pointers. This commit adds conditional compilation directives to fstext headers to do the same when compiling against OpenFst-1.5. --- src/fstext/context-fst-inl.h | 6 ++++++ src/fstext/context-fst.h | 14 +++++++++++--- src/fstext/table-matcher.h | 26 +++++++++++++++++++------- src/fstext/trivial-factor-weight.h | 25 +++++++++++++++++++------ 4 files changed, 55 insertions(+), 16 deletions(-) diff --git a/src/fstext/context-fst-inl.h b/src/fstext/context-fst-inl.h index 5127e7ae584..9472e611f77 100644 --- a/src/fstext/context-fst-inl.h +++ b/src/fstext/context-fst-inl.h @@ -360,14 +360,20 @@ void ContextFstImpl::Expand(StateId s) { // expands arcs only [not template ContextFst::ContextFst(const ContextFst &fst, bool reset) { if (reset) { +#ifdef HAVE_OPENFST_GE_10500 + impl_ = std::make_shared >(*(fst.impl_)); +#else impl_ = new ContextFstImpl(*(fst.impl_)); // Copy constructor of ContextFstImpl. // Main use of calling with reset = true is to free up memory // (e.g. then you could delete original one). Might be useful in transcription // expansion during training. +#endif } else { impl_ = fst.impl_; +#ifndef HAVE_OPENFST_GE_10500 impl_->IncrRefCount(); +#endif } } diff --git a/src/fstext/context-fst.h b/src/fstext/context-fst.h index 15cb0ef9fdb..0f2fe6c817d 100644 --- a/src/fstext/context-fst.h +++ b/src/fstext/context-fst.h @@ -244,7 +244,9 @@ class ContextFst : public Fst { ContextFst(const ContextFst &fst, bool reset = false); - virtual ~ContextFst() { if (!impl_->DecrRefCount()) delete impl_; } +#ifndef HAVE_OPENFST_GE_10500 + virtual ~ContextFst() { if (!impl_->DecrRefCount()) delete impl_; } +#endif virtual StateId Start() const { return impl_->Start(); } @@ -307,13 +309,19 @@ class ContextFst : public Fst { friend class CacheStateIterator >; // so it can see impl_. private: +#ifdef HAVE_OPENFST_GE_10500 + std::shared_ptr > impl_; // protected so CacheStateIterator + ContextFstImpl *GetImpl() const { return impl_.get(); } +#else ContextFstImpl *impl_; // protected so CacheStateIterator // Makes visible to friends. ContextFstImpl *GetImpl() const { return impl_; } - // would be: ImplToFst >::GetImpl(); - // but need to convert to using the ImplToFst stuff. + // would be: ImplToFst >::GetImpl(); + // but need to convert to using the ImplToFst stuff. void operator = (const ContextFstImpl &fst); // disallow +#endif + }; /// Useful utility function for writing these vectors to disk. diff --git a/src/fstext/table-matcher.h b/src/fstext/table-matcher.h index aed821a8725..da23c83a546 100644 --- a/src/fstext/table-matcher.h +++ b/src/fstext/table-matcher.h @@ -86,7 +86,9 @@ class TableMatcherImpl : public MatcherBase { virtual const FST &GetFst() const { return *fst_; } virtual ~TableMatcherImpl() { +#ifndef HAVE_OPENFST_GE_10500 assert(RefCount() == 0); +#endif vector *const empty = ((vector*)(NULL)) + 1; // special marker. 
for (size_t i = 0; i < tables_.size(); i++) { if (tables_[i] != NULL && tables_[i] != empty) @@ -219,6 +221,7 @@ class TableMatcherImpl : public MatcherBase { virtual uint64 Properties(uint64 props) const { return props; } // simple matcher that does // not change its FST, so properties are properties of FST it is applied to +#ifndef HAVE_OPENFST_GE_10500 int RefCount() const { return ref_count_.count(); } @@ -230,8 +233,11 @@ class TableMatcherImpl : public MatcherBase { int DecrRefCount() { return ref_count_.Decr(); } +#endif private: +#ifndef HAVE_OPENFST_GE_10500 RefCounter ref_count_; // Reference count +#endif virtual void SetState_(StateId s) { SetState(s); } virtual bool Find_(Label label) { return Find(label); } @@ -263,22 +269,26 @@ class TableMatcher : public MatcherBase { typedef StateId ArcId; // Use this type to store arc offsets [it's actually size_t // in the Seek function of ArcIterator, but StateId should be big enough]. typedef typename Arc::Weight Weight; + typedef TableMatcherImpl I; TableMatcher(const FST &fst, MatchType match_type, const TableMatcherOptions &opts = TableMatcherOptions()): - impl_(new TableMatcherImpl(fst, match_type, opts)) { } - + impl_(new I(fst, match_type, opts)) { } TableMatcher(const TableMatcher &matcher, bool safe): impl_(matcher.impl_) { - impl_->IncrRefCount(); +#ifndef HAVE_OPENFST_GE_10500 + impl_->IncrRefCount(); +#endif } virtual const FST &GetFst() const { return impl_->GetFst(); } +#ifndef HAVE_OPENFST_GE_10500 virtual ~TableMatcher() { if (!impl_->DecrRefCount()) delete impl_; } +#endif virtual MatchType Type(bool test) const { return impl_->Type(test); } @@ -301,7 +311,11 @@ class TableMatcher : public MatcherBase { virtual uint64 Properties(uint64 props) const { return impl_->Properties(props); } // simple matcher that does // not change its FST, so properties are properties of FST it is applied to private: - TableMatcherImpl *impl_; +#ifdef HAVE_OPENFST_GE_10500 + std::shared_ptr impl_; +#else + I *impl_; +#endif virtual void SetState_(StateId s) { impl_->SetState(s); } virtual bool Find_(Label label) { return impl_->Find(label); } @@ -339,7 +353,7 @@ void TableCompose(const Fst &ifst1, const Fst &ifst2, *ofst = ComposeFst(ifst1, ifst2, impl_opts); } else { assert(opts.table_match_type == MATCH_INPUT) ; - // ComposeFstImplOptions templated on matcher for fst1, matcher for fst2. + // ComposeFstImplOptions templated on matcher for fst1, matcher for fst2. ComposeFstImplOptions, TableMatcher > impl_opts(nopts); impl_opts.matcher2 = new TableMatcher(ifst2, MATCH_INPUT, opts); *ofst = ComposeFst(ifst1, ifst2, impl_opts); @@ -388,5 +402,3 @@ void TableCompose(const Fst &ifst1, const Fst &ifst2, } // end namespace fst #endif - - diff --git a/src/fstext/trivial-factor-weight.h b/src/fstext/trivial-factor-weight.h index 109ba75ce10..b8afa757b39 100644 --- a/src/fstext/trivial-factor-weight.h +++ b/src/fstext/trivial-factor-weight.h @@ -353,10 +353,18 @@ class TrivialFactorWeightFst : public ImplToFst< TrivialFactorWeightFstImpl Impl; TrivialFactorWeightFst(const Fst &fst) +#ifdef HAVE_OPENFST_GE_10500 + : ImplToFst(std::make_shared(fst, TrivialFactorWeightOptions())) {} +#else : ImplToFst(new Impl(fst, TrivialFactorWeightOptions())) {} +#endif TrivialFactorWeightFst(const Fst &fst, const TrivialFactorWeightOptions &opts) +#ifdef HAVE_OPENFST_GE_10500 + : ImplToFst(std::make_shared(fst, opts)) {} +#else : ImplToFst(new Impl(fst, opts)) {} +#endif // See Fst<>::Copy() for doc. 
TrivialFactorWeightFst(const TrivialFactorWeightFst &fst, bool copy) @@ -370,12 +378,18 @@ class TrivialFactorWeightFst : public ImplToFst< TrivialFactorWeightFstImpl *data) const; virtual void InitArcIterator(StateId s, ArcIteratorData *data) const { - GetImpl()->InitArcIterator(s, data); + GetMutableImpl()->InitArcIterator(s, data); } private: // Makes visible to friends. - Impl *GetImpl() const { return ImplToFst::GetImpl(); } +#ifdef HAVE_OPENFST_GE_10500 + using ImplToFst::GetImpl; + using ImplToFst::GetMutableImpl; +#else + const Impl *GetImpl() const { return ImplToFst::GetImpl(); } + Impl *GetMutableImpl() const { return ImplToFst::GetImpl(); } +#endif void operator=(const TrivialFactorWeightFst &fst); // Disallow }; @@ -387,7 +401,7 @@ class StateIterator< TrivialFactorWeightFst > : public CacheStateIterator< TrivialFactorWeightFst > { public: explicit StateIterator(const TrivialFactorWeightFst &fst) - : CacheStateIterator< TrivialFactorWeightFst >(fst, fst.GetImpl()) {} + : CacheStateIterator< TrivialFactorWeightFst >(fst, fst.GetMutableImpl()) {} }; @@ -399,9 +413,9 @@ class ArcIterator< TrivialFactorWeightFst > typedef typename A::StateId StateId; ArcIterator(const TrivialFactorWeightFst &fst, StateId s) - : CacheArcIterator< TrivialFactorWeightFst >(fst.GetImpl(), s) { + : CacheArcIterator< TrivialFactorWeightFst >(fst.GetMutableImpl(), s) { if (!fst.GetImpl()->HasArcs(s)) - fst.GetImpl()->Expand(s); + fst.GetMutableImpl()->Expand(s); } private: @@ -420,4 +434,3 @@ void TrivialFactorWeightFst::InitStateIterator(StateIteratorData *data) } // namespace fst #endif - From f2f6dc4119429fcf4f89c7abd49465ece3e53244 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 1 Dec 2016 16:01:55 -0800 Subject: [PATCH 068/213] Add support for API changes in OpenFst-1.5. 
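OpenFst-1.5 also changes a few function and constructor signatures that Kaldi calls
directly. The diffs below adapt the affected call sites: results of ScaleTupleWeight()
are passed through an explicit Weight() conversion, the string concatenation in
ArcticWeightTpl::Type() goes through an explicit std::string, and delayed MapFst
construction now takes MapFstOptions instead of a bare CacheOptions. A rough sketch of
that last pattern is below; it is illustrative only, the helper name is invented, and
the exact types and includes follow the call sites changed in this patch (e.g.
lattice-lmrescore.cc), not a verbatim copy of them:

    #include "fstext/lattice-utils.h"
    #include "lat/kaldi-lattice.h"

    void MapStdFstToLatticeSemiring(const fst::StdVectorFst &std_fst,
                                    kaldi::Lattice *lat) {
      const int num_states_cache = 50000;
      fst::CacheOptions cache_opts(true, num_states_cache);
      fst::MapFstOptions mapfst_opts(cache_opts);  // OpenFst >= 1.5: wrap the cache options.
      fst::StdToLatticeMapper<kaldi::BaseFloat> mapper;
      // Delayed map from the tropical semiring into the lattice semiring, with all
      // the cost on the first (graph) component of the weight pair.
      fst::MapFst<fst::StdArc, kaldi::LatticeArc,
                  fst::StdToLatticeMapper<kaldi::BaseFloat> >
          mapped_fst(std_fst, mapper, mapfst_opts);
      *lat = mapped_fst;  // Expand the delayed FST into a VectorFst-based lattice.
    }

Older OpenFst versions accept the CacheOptions directly, which is why the call sites
below keep both variants behind preprocessor guards.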
--- src/fstext/lattice-utils-inl.h | 13 +++++++++---- src/fstext/lattice-weight.h | 2 +- src/lat/arctic-weight.h | 8 ++++---- src/latbin/lattice-compose.cc | 7 ++++++- src/latbin/lattice-lmrescore.cc | 7 ++++++- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/fstext/lattice-utils-inl.h b/src/fstext/lattice-utils-inl.h index a3f603aa274..0f19b2b9513 100644 --- a/src/fstext/lattice-utils-inl.h +++ b/src/fstext/lattice-utils-inl.h @@ -209,12 +209,12 @@ void ScaleLattice( !aiter.Done(); aiter.Next()) { Arc arc = aiter.Value(); - arc.weight = ScaleTupleWeight(arc.weight, scale); + arc.weight = Weight(ScaleTupleWeight(arc.weight, scale)); aiter.SetValue(arc); } Weight final_weight = fst->Final(s); if (final_weight != Weight::Zero()) - fst->SetFinal(s, ScaleTupleWeight(final_weight, scale)); + fst->SetFinal(s, Weight(ScaleTupleWeight(final_weight, scale))); } } @@ -267,10 +267,15 @@ void ConvertFstToLattice( const ExpandedFst > &ifst, MutableFst > > *ofst) { int32 num_states_cache = 50000; - CacheOptions cache_opts(true, num_states_cache); +#ifdef HAVE_OPENFST_GE_10500 + fst::CacheOptions cache_opts(true, num_states_cache); + fst::MapFstOptions mapfst_opts(cache_opts); +#else + fst::CacheOptions mapfst_opts(true, num_states_cache); +#endif StdToLatticeMapper mapper; MapFst >, - StdToLatticeMapper > map_fst(ifst, mapper, cache_opts); + StdToLatticeMapper > map_fst(ifst, mapper, mapfst_opts); *ofst = map_fst; } diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index 8453b9c5670..3a03733cb3d 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -748,7 +748,7 @@ inline CompactLatticeWeightTpl ScaleTupleWeight( const CompactLatticeWeightTpl &w, const vector > &scale) { return CompactLatticeWeightTpl( - ScaleTupleWeight(w.Weight(), scale), w.String()); + Weight(ScaleTupleWeight(w.Weight(), scale)), w.String()); } /** Define some ConvertLatticeWeight functions that are used in various lattice diff --git a/src/lat/arctic-weight.h b/src/lat/arctic-weight.h index 7806cec96d1..2b308f44e65 100644 --- a/src/lat/arctic-weight.h +++ b/src/lat/arctic-weight.h @@ -27,8 +27,8 @@ namespace fst { // Arctic semiring: (max, +, inf, 0) // We define the Arctic semiring T' = (R \cup {-inf, +inf}, max, +, -inf, 0). -// The term "Arctic" came from Keith Kintzley (kintzley@jhu.edu), as opposite -// to the Tropical semiring. +// The term "Arctic" came from Keith Kintzley (kintzley@jhu.edu), as opposite +// to the Tropical semiring. 
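// A small worked example of the definitions above, with "plus" = max and "times" = +
// (added for clarity; the values follow directly from the semiring definition):
//   Plus(3.0, 5.0)  = max(3.0, 5.0) = 5.0
//   Times(3.0, 5.0) = 3.0 + 5.0     = 8.0
//   Zero() = -infinity (identity for Plus), One() = 0.0 (identity for Times),
// i.e. the max-plus analogue of the (min, +) Tropical semiring.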
template class ArcticWeightTpl : public FloatWeightTpl { public: @@ -49,7 +49,7 @@ class ArcticWeightTpl : public FloatWeightTpl { return ArcticWeightTpl(0.0F); } static const string &Type() { - static const string type = "arctic" + + static const string type = string("arctic") + FloatWeightTpl::GetPrecisionString(); return type; } @@ -57,7 +57,7 @@ class ArcticWeightTpl : public FloatWeightTpl { static ArcticWeightTpl NoWeight() { return ArcticWeightTpl(numeric_limits::infinity()); } - + bool Member() const { // First part fails for IEEE NaN return Value() == Value() && Value() != numeric_limits::infinity(); diff --git a/src/latbin/lattice-compose.cc b/src/latbin/lattice-compose.cc index 5feb958a6a1..2d1415eede5 100644 --- a/src/latbin/lattice-compose.cc +++ b/src/latbin/lattice-compose.cc @@ -85,10 +85,15 @@ int main(int argc, char *argv[]) { if (phi_label > 0) PropagateFinal(phi_label, fst2); +#ifdef HAVE_OPENFST_GE_10500 fst::CacheOptions cache_opts(true, num_states_cache); + fst::MapFstOptions mapfst_opts(cache_opts); +#else + fst::CacheOptions mapfst_opts(true, num_states_cache); +#endif fst::StdToLatticeMapper mapper; fst::MapFst > - mapped_fst2(*fst2, mapper, cache_opts); + mapped_fst2(*fst2, mapper, mapfst_opts); for (; !lattice_reader1.Done(); lattice_reader1.Next()) { std::string key = lattice_reader1.Key(); KALDI_VLOG(1) << "Processing lattice for key " << key; diff --git a/src/latbin/lattice-lmrescore.cc b/src/latbin/lattice-lmrescore.cc index b8f1067e607..10de27c43fc 100644 --- a/src/latbin/lattice-lmrescore.cc +++ b/src/latbin/lattice-lmrescore.cc @@ -74,10 +74,15 @@ int main(int argc, char *argv[]) { // mapped_fst is the LM fst interpreted using the LatticeWeight semiring, // with all the cost on the first member of the pair (since it's a graph // weight). +#ifdef HAVE_OPENFST_GE_10500 fst::CacheOptions cache_opts(true, num_states_cache); + fst::MapFstOptions mapfst_opts(cache_opts); +#else + fst::CacheOptions mapfst_opts(true, num_states_cache); +#endif fst::StdToLatticeMapper mapper; fst::MapFst > - lm_fst(*std_lm_fst, mapper, cache_opts); + lm_fst(*std_lm_fst, mapper, mapfst_opts); delete std_lm_fst; // The next fifteen or so lines are a kind of optimization and From e7c323399028e009ca3592a5d18413f7a20a7c42 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Fri, 2 Dec 2016 15:48:24 -0800 Subject: [PATCH 069/213] Rework OpenFst related preprocessor conditionals. 
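Feature-test style macros (HAVE_OPENFST_GE_10400, HAVE_OPENFST_GE_10500, ...) are
replaced with comparisons against a single numeric OPENFST_VER macro, so supporting a
new OpenFst release does not require introducing yet another ad-hoc define. Negated
checks such as "#ifndef HAVE_OPENFST_GE_10500" become "#if OPENFST_VER >= 10500" with
an empty first branch, which keeps the pre-1.5 code in the #else branch. The pattern,
as a minimal sketch (the exact encoding of OPENFST_VER, e.g. 10504 for OpenFst 1.5.4,
is assumed here to be major*10000 + minor*100 + patch):

    #if OPENFST_VER >= 10500
      // OpenFst >= 1.5 path, e.g. hold the Fst implementation in a std::shared_ptr.
    #else
      // Older OpenFst path, e.g. keep the manual RefCounter-based lifetime handling.
    #endif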
--- src/bin/phones-to-prons.cc | 2 +- src/fstext/context-fst-inl.h | 19 ++++---- src/fstext/context-fst-test.cc | 6 +-- src/fstext/context-fst.h | 12 +++--- src/fstext/determinize-lattice-test.cc | 12 +++--- src/fstext/determinize-star-test.cc | 50 +++++++++++----------- src/fstext/factor-test.cc | 4 +- src/fstext/fstext-utils-test.cc | 6 +-- src/fstext/kaldi-fst-io-inl.h | 2 +- src/fstext/lattice-utils-inl.h | 2 +- src/fstext/lattice-utils-test.cc | 8 ++-- src/fstext/pre-determinize-test.cc | 16 +++---- src/fstext/prune-special-test.cc | 6 +-- src/fstext/push-special-test.cc | 4 +- src/fstext/remove-eps-local-test.cc | 8 ++-- src/fstext/table-matcher-test.cc | 16 +++---- src/fstext/table-matcher.h | 17 +++++--- src/fstext/trivial-factor-weight-test.cc | 20 ++++----- src/fstext/trivial-factor-weight.h | 10 ++--- src/lat/determinize-lattice-pruned-test.cc | 12 +++--- src/lat/kaldi-lattice.cc | 4 +- src/lat/push-lattice-test.cc | 4 +- src/latbin/lattice-compose.cc | 2 +- src/latbin/lattice-lmrescore.cc | 2 +- 24 files changed, 127 insertions(+), 117 deletions(-) diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index f9b9291a90b..33a821ce6ab 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -170,7 +170,7 @@ int main(int argc, char *argv[]) { << "not reach end-state, or mismatched lexicon.)"; if (g_kaldi_verbose_level >= 2) { KALDI_LOG << "phn2word FST is below:"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 fst::FstPrinter fstprinter(phn2word, NULL, NULL, NULL, false, true, "\t"); #else fst::FstPrinter fstprinter(phn2word, NULL, NULL, NULL, false, true); diff --git a/src/fstext/context-fst-inl.h b/src/fstext/context-fst-inl.h index 9472e611f77..204c8b92c1f 100644 --- a/src/fstext/context-fst-inl.h +++ b/src/fstext/context-fst-inl.h @@ -42,7 +42,8 @@ typename ContextFstImpl::StateId if (iter == state_map_.end()) { // Not already in map. StateId this_state_id = (StateId)state_seqs_.size(); //This check is not needed with OpenFst >= 1.4 -#ifndef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 +#else StateId this_state_id_check = CacheImpl::AddState(); // goes back to VectorFstBaseImpl, inherited via CacheFst assert(this_state_id == this_state_id_check); @@ -325,7 +326,7 @@ void ContextFstImpl::Expand(StateId s) { // expands arcs only [not // We just try adding all possible symbols on the output side. 
Arc arc; if (this->CreateArc(s, subsequential_symbol_, &arc)) { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 this->PushArc(s, arc); #else this->AddArc(s, arc); @@ -335,7 +336,7 @@ void ContextFstImpl::Expand(StateId s) { // expands arcs only [not iter != phone_syms_.end(); ++iter) { Label phone = *iter; if (this->CreateArc(s, phone, &arc)) { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 this->PushArc(s, arc); #else this->AddArc(s, arc); @@ -346,7 +347,7 @@ void ContextFstImpl::Expand(StateId s) { // expands arcs only [not iter != disambig_syms_.end(); ++iter) { Label disambig_sym = *iter; if (this->CreateArc(s, disambig_sym, &arc)) { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 this->PushArc(s, arc); #else this->AddArc(s, arc); @@ -359,22 +360,24 @@ void ContextFstImpl::Expand(StateId s) { // expands arcs only [not template ContextFst::ContextFst(const ContextFst &fst, bool reset) { +#if OPENFST_VER >= 10500 if (reset) { -#ifdef HAVE_OPENFST_GE_10500 impl_ = std::make_shared >(*(fst.impl_)); + } else { + impl_ = fst.impl_; + } #else + if (reset) { impl_ = new ContextFstImpl(*(fst.impl_)); // Copy constructor of ContextFstImpl. // Main use of calling with reset = true is to free up memory // (e.g. then you could delete original one). Might be useful in transcription // expansion during training. -#endif } else { impl_ = fst.impl_; -#ifndef HAVE_OPENFST_GE_10500 impl_->IncrRefCount(); -#endif } +#endif } diff --git a/src/fstext/context-fst-test.cc b/src/fstext/context-fst-test.cc index 53c774f829a..72b50da1339 100644 --- a/src/fstext/context-fst-test.cc +++ b/src/fstext/context-fst-test.cc @@ -192,7 +192,7 @@ template static void TestContextFst(bool verbose, bool use_matcher) { } if (verbose) { // Try to print the fst. -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(cfst, cfst.InputSymbols(), cfst.OutputSymbols(), NULL, false, true, "\t"); #else FstPrinter fstprinter(cfst, cfst.InputSymbols(), cfst.OutputSymbols(), NULL, false, true); @@ -211,7 +211,7 @@ template static void TestContextFst(bool verbose, bool use_matcher) { if (verbose) { std::cout << "Sequence FST is:\n"; { // Try to print the fst. -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*f, f->InputSymbols(), f->OutputSymbols(), NULL, false, true, "\t"); #else FstPrinter fstprinter(*f, f->InputSymbols(), f->OutputSymbols(), NULL, false, true); @@ -257,7 +257,7 @@ template static void TestContextFst(bool verbose, bool use_matcher) { if (verbose) { std::cout << "Composed FST is:\n"; { // Try to print the fst. -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst_composed, fst_composed.InputSymbols(), fst_composed.OutputSymbols(), NULL, false, true, "\t"); #else diff --git a/src/fstext/context-fst.h b/src/fstext/context-fst.h index 0f2fe6c817d..2d13e944f0a 100644 --- a/src/fstext/context-fst.h +++ b/src/fstext/context-fst.h @@ -94,7 +94,7 @@ class ContextFstImpl : public CacheImpl { typedef typename Arc::Weight Weight; typedef typename Arc::StateId StateId; typedef typename Arc::Label Label; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 typedef DefaultCacheStore Store; typedef typename Store::State State; #endif @@ -216,7 +216,8 @@ class ContextFst : public Fst { public: friend class ArcIterator< ContextFst >; friend class StateIterator< ContextFst >; -#ifndef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 +#else // We have to supply the default template argument below to work around a // Visual Studio bug. 
friend class CacheArcIterator< ContextFst, @@ -226,7 +227,7 @@ class ContextFst : public Fst { typedef typename Arc::Weight Weight; typedef typename Arc::Label Label; typedef typename Arc::StateId StateId; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 typedef DefaultCacheStore Store; typedef typename Store::State State; #else @@ -244,7 +245,8 @@ class ContextFst : public Fst { ContextFst(const ContextFst &fst, bool reset = false); -#ifndef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 +#else virtual ~ContextFst() { if (!impl_->DecrRefCount()) delete impl_; } #endif @@ -309,7 +311,7 @@ class ContextFst : public Fst { friend class CacheStateIterator >; // so it can see impl_. private: -#ifdef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 std::shared_ptr > impl_; // protected so CacheStateIterator ContextFstImpl *GetImpl() const { return impl_.get(); } #else diff --git a/src/fstext/determinize-lattice-test.cc b/src/fstext/determinize-lattice-test.cc index a12e368ea86..42122c6e193 100644 --- a/src/fstext/determinize-lattice-test.cc +++ b/src/fstext/determinize-lattice-test.cc @@ -91,7 +91,7 @@ template void TestDeterminizeLattice() { VectorFst *fst = RandFst(); std::cout << "FST before lattice-determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); @@ -107,7 +107,7 @@ template void TestDeterminizeLattice() { throw std::runtime_error("could not determinize"); std::cout << "FST after lattice-determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true); @@ -122,7 +122,7 @@ template void TestDeterminizeLattice() { ConvertLattice(*fst, &compact_fst, false); std::cout << "Compact FST is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(compact_fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(compact_fst, NULL, NULL, NULL, false, true); @@ -137,7 +137,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact version of determinized FST is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(compact_det_fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(compact_det_fst, NULL, NULL, NULL, false, true); @@ -162,7 +162,7 @@ template void TestDeterminizeLattice2() { VectorFst *fst = RandFst(opts); std::cout << "FST before lattice-determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); @@ -173,7 +173,7 @@ template void TestDeterminizeLattice2() { DeterminizeLattice(*fst, &ofst); std::cout << "FST after lattice-determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); diff --git a/src/fstext/determinize-star-test.cc b/src/fstext/determinize-star-test.cc index d6aaaa4e024..f308d8460d8 100644 --- a/src/fstext/determinize-star-test.cc +++ b/src/fstext/determinize-star-test.cc @@ -37,7 +37,7 @@ template void TestDeterminizeGeneral() { VectorFst *fst = RandFst(); std::cout << "FST before determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 
FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); @@ -49,7 +49,7 @@ template void TestDeterminizeGeneral() { DeterminizeStar >(*fst, &ofst, kDelta, NULL, max_states); std::cout << "FST after determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); @@ -108,7 +108,7 @@ template void TestDeterminize() { std::cout <<" printing before trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -120,7 +120,7 @@ template void TestDeterminize() { std::cout <<" printing after trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -137,7 +137,7 @@ template void TestDeterminize() { std::cout <<" printing after predeterminization\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -157,7 +157,7 @@ template void TestDeterminize() { std::cout <<" printing after epsilon removal\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -180,7 +180,7 @@ template void TestDeterminize() { { std::cout <<" printing after determinization [baseline]\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true); @@ -191,7 +191,7 @@ template void TestDeterminize() { { std::cout <<" printing after determinization [star]\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); @@ -205,7 +205,7 @@ template void TestDeterminize() { int64 num_removed = DeleteISymbols(&ofst_star, extra_syms); std::cout <<" printing after removing "<= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); @@ -277,7 +277,7 @@ template void TestPush() { std::cout <<" printing before trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -289,7 +289,7 @@ template void TestPush() { std::cout <<" printing after trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -310,7 +310,7 @@ template void TestPush() { std::cout <<" printing after pushing\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst_pushed, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(fst_pushed, sptr, sptr, NULL, false, true); @@ -367,7 +367,7 @@ template void TestMinimize() { std::cout <<" printing before trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if 
OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -379,7 +379,7 @@ template void TestMinimize() { std::cout <<" printing after trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -396,7 +396,7 @@ template void TestMinimize() { std::cout <<" printing after predeterminization\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -416,7 +416,7 @@ template void TestMinimize() { std::cout <<" printing after epsilon removal\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -433,7 +433,7 @@ template void TestMinimize() { } { std::cout <<" printing after determinization [baseline]\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true); @@ -449,7 +449,7 @@ template void TestMinimize() { DeterminizeStar(*fst, &gallic_fst); { std::cout <<" printing after determinization by DeterminizeStar [in gallic]\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true); @@ -463,7 +463,7 @@ template void TestMinimize() { { std::cout <<" printing after pushing weights [in gallic]\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true); @@ -476,7 +476,7 @@ template void TestMinimize() { Minimize(&gallic_fst); { std::cout <<" printing after minimization [in gallic]\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true); @@ -485,7 +485,7 @@ template void TestMinimize() { } printf("Converting gallic back to regular [my approach]\n"); -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 TrivialFactorWeightFst< GallicArc, GallicFactor > fwfst(gallic_fst); #else @@ -494,7 +494,7 @@ template void TestMinimize() { #endif { std::cout <<" printing factor-weight FST\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true); @@ -502,7 +502,7 @@ template void TestMinimize() { fstprinter.Print(&std::cout, "standard output"); } -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 Map(fwfst, &ofst_star, FromGallicMapper()); #else Map(fwfst, &ofst_star, FromGallicMapper()); @@ -510,7 +510,7 @@ template void TestMinimize() { { std::cout <<" printing after converting back to regular FST\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); @@ -527,7 +527,7 @@ template void TestMinimize() { int64 num_removed = 
DeleteISymbols(&ofst_star, extra_syms); std::cout <<" printing after removing "<= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); diff --git a/src/fstext/factor-test.cc b/src/fstext/factor-test.cc index 9416f6fa4a4..1d446796b05 100644 --- a/src/fstext/factor-test.cc +++ b/src/fstext/factor-test.cc @@ -78,7 +78,7 @@ template static void TestFactor() { std::cout <<" printing before trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true); @@ -90,7 +90,7 @@ template static void TestFactor() { std::cout <<" printing after trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true); diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc index 7f63d83186b..494935d3622 100644 --- a/src/fstext/fstext-utils-test.cc +++ b/src/fstext/fstext-utils-test.cc @@ -146,7 +146,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing before trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -158,7 +158,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing after trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -376,7 +376,7 @@ void TestEqualAlign() { template void Print(const Fst &fst, std::string message) { std::cout << message << "\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(fst, NULL, NULL, NULL, false, true); diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index 9185295bee6..58895449c72 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -42,7 +42,7 @@ void WriteFstKaldi(std::ostream &os, bool binary, // appear on its own line. 
os << '\n'; bool acceptor = false, write_one = false; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); #else diff --git a/src/fstext/lattice-utils-inl.h b/src/fstext/lattice-utils-inl.h index 0f19b2b9513..f15e8d2cc57 100644 --- a/src/fstext/lattice-utils-inl.h +++ b/src/fstext/lattice-utils-inl.h @@ -267,7 +267,7 @@ void ConvertFstToLattice( const ExpandedFst > &ifst, MutableFst > > *ofst) { int32 num_states_cache = 50000; -#ifdef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 fst::CacheOptions cache_opts(true, num_states_cache); fst::MapFstOptions mapfst_opts(cache_opts); #else diff --git a/src/fstext/lattice-utils-test.cc b/src/fstext/lattice-utils-test.cc index dc062343298..51df0ce8364 100644 --- a/src/fstext/lattice-utils-test.cc +++ b/src/fstext/lattice-utils-test.cc @@ -30,7 +30,7 @@ template void TestConvert(bool invert) { VectorFst *fst = RandFst(); std::cout << "FST before converting to compact-arc is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); @@ -42,7 +42,7 @@ template void TestConvert(bool invert) { std::cout << "FST after converting is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); @@ -53,7 +53,7 @@ template void TestConvert(bool invert) { ConvertLattice(ofst, &origfst, invert); std::cout << "FST after back conversion is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true); @@ -78,7 +78,7 @@ template void TestShortestPath() { std::cout << "Testing shortest path\n"; std::cout << "FST before converting to compact-arc is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); diff --git a/src/fstext/pre-determinize-test.cc b/src/fstext/pre-determinize-test.cc index 8694267407b..774507b0792 100644 --- a/src/fstext/pre-determinize-test.cc +++ b/src/fstext/pre-determinize-test.cc @@ -69,7 +69,7 @@ template void TestPreDeterminize() { std::cout <<" printing before trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -81,7 +81,7 @@ template void TestPreDeterminize() { std::cout <<" printing after trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -99,7 +99,7 @@ template void TestPreDeterminize() { std::cout <<" printing after predeterminization\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -119,7 +119,7 @@ template void TestPreDeterminize() { std::cout <<" printing after epsilon removal\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, 
NULL, false, true); @@ -133,7 +133,7 @@ template void TestPreDeterminize() { Determinize(*fst, &ofst, opts); std::cout <<" printing after determinization\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true); @@ -144,7 +144,7 @@ template void TestPreDeterminize() { int64 num_removed = DeleteISymbols(&ofst, extra_syms); std::cout <<" printing after removing "<= 10400 FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true); @@ -200,7 +200,7 @@ template void TestAddSelfLoops() { } std::cout <<" printing before adding self-loops\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true); @@ -223,7 +223,7 @@ template void TestAddSelfLoops() { std::cout <<" printing after adding self-loops\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true); diff --git a/src/fstext/prune-special-test.cc b/src/fstext/prune-special-test.cc index e879a7593ac..cb55edca6cc 100644 --- a/src/fstext/prune-special-test.cc +++ b/src/fstext/prune-special-test.cc @@ -37,7 +37,7 @@ static void TestPruneSpecial() { float beam = 0.55; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*ifst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*ifst, NULL, NULL, NULL, false, true); @@ -50,7 +50,7 @@ static void TestPruneSpecial() { VectorFst ofst1; PruneSpecial(*ifst, &ofst1, beam); { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst1, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst1, NULL, NULL, NULL, false, true); @@ -63,7 +63,7 @@ static void TestPruneSpecial() { VectorFst ofst2; Prune(*ifst, &ofst2, beam); { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst2, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst2, NULL, NULL, NULL, false, true); diff --git a/src/fstext/push-special-test.cc b/src/fstext/push-special-test.cc index 7f8ccbe92db..0106492e887 100644 --- a/src/fstext/push-special-test.cc +++ b/src/fstext/push-special-test.cc @@ -37,7 +37,7 @@ static void TestPushSpecial() { VectorFst *fst = RandFst(); { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); @@ -59,7 +59,7 @@ static void TestPushSpecial() { { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst_copy, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(fst_copy, NULL, NULL, NULL, false, true); diff --git a/src/fstext/remove-eps-local-test.cc b/src/fstext/remove-eps-local-test.cc index 676ba82025c..2c6c6f8d97f 100644 --- a/src/fstext/remove-eps-local-test.cc +++ b/src/fstext/remove-eps-local-test.cc @@ -82,7 +82,7 @@ template static void TestRemoveEpsLocal() { std::cout <<" printing after trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true); @@ -99,7 +99,7 @@ template static void 
TestRemoveEpsLocal() { { std::cout << "copy1 = \n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst_copy1, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(fst_copy1, sptr, sptr, NULL, false, true); @@ -148,7 +148,7 @@ static void TestRemoveEpsLocalSpecial() { #endif { std::cout << "logfst = \n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*logfst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*logfst, NULL, NULL, NULL, false, true); @@ -167,7 +167,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst2 = \n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(logfst2, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(logfst2, NULL, NULL, NULL, false, true); diff --git a/src/fstext/table-matcher-test.cc b/src/fstext/table-matcher-test.cc index b9e8a864454..0124fff4147 100644 --- a/src/fstext/table-matcher-test.cc +++ b/src/fstext/table-matcher-test.cc @@ -63,7 +63,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<"Table-Composed FST\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(composed, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(composed, NULL, NULL, NULL, false, true); @@ -73,7 +73,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<" Baseline-Composed FST\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(composed_baseline, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(composed_baseline, NULL, NULL, NULL, false, true); @@ -86,7 +86,7 @@ template void TestTableMatcher(bool connect, bool left) { Difference(composed, composed_baseline, &diff1); std::cout <<" Diff1 (composed - baseline) \n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true); @@ -99,7 +99,7 @@ template void TestTableMatcher(bool connect, bool left) { Difference(composed_baseline, composed, &diff2); std::cout <<" Diff2 (baseline - composed) \n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true); @@ -164,7 +164,7 @@ template void TestTableMatcherCacheLeft(bool connect) { Difference(composed, composed_baseline, &diff1); std::cout <<" Diff1 (composed - baseline) \n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true); @@ -177,7 +177,7 @@ template void TestTableMatcherCacheLeft(bool connect) { Difference(composed_baseline, composed, &diff2); std::cout <<" Diff2 (baseline - composed) \n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true); @@ -242,7 +242,7 @@ template void TestTableMatcherCacheRight(bool connect) { Difference(composed, composed_baseline, &diff1); std::cout <<" Diff1 (composed - baseline) \n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true); @@ -255,7 +255,7 @@ template void 
TestTableMatcherCacheRight(bool connect) { Difference(composed_baseline, composed, &diff2); std::cout <<" Diff2 (baseline - composed) \n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true); diff --git a/src/fstext/table-matcher.h b/src/fstext/table-matcher.h index da23c83a546..1a1b35d8c68 100644 --- a/src/fstext/table-matcher.h +++ b/src/fstext/table-matcher.h @@ -86,7 +86,8 @@ class TableMatcherImpl : public MatcherBase { virtual const FST &GetFst() const { return *fst_; } virtual ~TableMatcherImpl() { -#ifndef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 +#else assert(RefCount() == 0); #endif vector *const empty = ((vector*)(NULL)) + 1; // special marker. @@ -221,7 +222,8 @@ class TableMatcherImpl : public MatcherBase { virtual uint64 Properties(uint64 props) const { return props; } // simple matcher that does // not change its FST, so properties are properties of FST it is applied to -#ifndef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 +#else int RefCount() const { return ref_count_.count(); } @@ -235,7 +237,8 @@ class TableMatcherImpl : public MatcherBase { } #endif private: -#ifndef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 +#else RefCounter ref_count_; // Reference count #endif @@ -277,14 +280,16 @@ class TableMatcher : public MatcherBase { TableMatcher(const TableMatcher &matcher, bool safe): impl_(matcher.impl_) { -#ifndef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 +#else impl_->IncrRefCount(); #endif } virtual const FST &GetFst() const { return impl_->GetFst(); } -#ifndef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 +#else virtual ~TableMatcher() { if (!impl_->DecrRefCount()) delete impl_; } @@ -311,7 +316,7 @@ class TableMatcher : public MatcherBase { virtual uint64 Properties(uint64 props) const { return impl_->Properties(props); } // simple matcher that does // not change its FST, so properties are properties of FST it is applied to private: -#ifdef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 std::shared_ptr impl_; #else I *impl_; diff --git a/src/fstext/trivial-factor-weight-test.cc b/src/fstext/trivial-factor-weight-test.cc index af3f4a3de89..fcf34b6834e 100644 --- a/src/fstext/trivial-factor-weight-test.cc +++ b/src/fstext/trivial-factor-weight-test.cc @@ -70,7 +70,7 @@ template void TestFactor() { std::cout <<" printing before trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -82,7 +82,7 @@ template void TestFactor() { std::cout <<" printing after trimming\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -97,7 +97,7 @@ template void TestFactor() { std::cout <<" printing after predeterminization\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -117,7 +117,7 @@ template void TestFactor() { std::cout <<" printing after double-epsilon removal\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); @@ -140,7 +140,7 @@ template void 
TestFactor() { { std::cout <<" printing gallic FST\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true); @@ -152,7 +152,7 @@ template void TestFactor() { // Map(ofst_star, &gallic_fst, ToGallicMapper()); printf("Converting gallic back to regular\n"); -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 TrivialFactorWeightFst< GallicArc, GallicFactor > fwfst(gallic_fst); #else @@ -161,7 +161,7 @@ template void TestFactor() { #endif { std::cout <<" printing factor-weight FST\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true); @@ -169,7 +169,7 @@ template void TestFactor() { fstprinter.Print(&std::cout, "standard output"); } -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 Map(fwfst, &ofst_star, FromGallicMapper()); #else Map(fwfst, &ofst_star, FromGallicMapper()); @@ -177,7 +177,7 @@ template void TestFactor() { { std::cout <<" printing after converting back to regular FST\n"; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); @@ -187,7 +187,7 @@ template void TestFactor() { VectorFst > new_gallic_fst; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 Map(ofst_star, &new_gallic_fst, ToGallicMapper()); #else Map(ofst_star, &new_gallic_fst, ToGallicMapper()); diff --git a/src/fstext/trivial-factor-weight.h b/src/fstext/trivial-factor-weight.h index b8afa757b39..3e42dd287db 100644 --- a/src/fstext/trivial-factor-weight.h +++ b/src/fstext/trivial-factor-weight.h @@ -117,7 +117,7 @@ class TrivialFactorWeightFstImpl typedef typename A::StateId StateId; typedef F FactorIterator; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 typedef DefaultCacheStore Store; typedef typename Store::State State; #endif @@ -344,7 +344,7 @@ class TrivialFactorWeightFst : public ImplToFst< TrivialFactorWeightFstImpl= 10400 typedef DefaultCacheStore Store; typedef typename Store::State State; #else @@ -353,14 +353,14 @@ class TrivialFactorWeightFst : public ImplToFst< TrivialFactorWeightFstImpl Impl; TrivialFactorWeightFst(const Fst &fst) -#ifdef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 : ImplToFst(std::make_shared(fst, TrivialFactorWeightOptions())) {} #else : ImplToFst(new Impl(fst, TrivialFactorWeightOptions())) {} #endif TrivialFactorWeightFst(const Fst &fst, const TrivialFactorWeightOptions &opts) -#ifdef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 : ImplToFst(std::make_shared(fst, opts)) {} #else : ImplToFst(new Impl(fst, opts)) {} @@ -383,7 +383,7 @@ class TrivialFactorWeightFst : public ImplToFst< TrivialFactorWeightFstImpl= 10500 using ImplToFst::GetImpl; using ImplToFst::GetMutableImpl; #else diff --git a/src/lat/determinize-lattice-pruned-test.cc b/src/lat/determinize-lattice-pruned-test.cc index d5f22454017..c932e3c95de 100644 --- a/src/lat/determinize-lattice-pruned-test.cc +++ b/src/lat/determinize-lattice-pruned-test.cc @@ -62,7 +62,7 @@ template void TestDeterminizeLatticePruned() { std::cout << "FST before lattice-determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); @@ -79,7 
+79,7 @@ template void TestDeterminizeLatticePruned() { std::cout << "FST after lattice-determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true); @@ -100,7 +100,7 @@ template void TestDeterminizeLatticePruned() { ConvertLattice(pruned_fst, &compact_pruned_fst, false); std::cout << "Compact pruned FST is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(compact_pruned_fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(compact_pruned_fst, NULL, NULL, NULL, false, true); @@ -111,7 +111,7 @@ template void TestDeterminizeLatticePruned() { std::cout << "Compact version of determinized FST is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(compact_pruned_det_fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(compact_pruned_det_fst, NULL, NULL, NULL, false, true); @@ -138,7 +138,7 @@ template void TestDeterminizeLatticePruned2() { VectorFst *fst = RandPairFst(opts); std::cout << "FST before lattice-determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); @@ -149,7 +149,7 @@ template void TestDeterminizeLatticePruned2() { DeterminizeLatticePruned(*fst, 10.0, &ofst); std::cout << "FST after lattice-determinizing is:\n"; { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); #else FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); diff --git a/src/lat/kaldi-lattice.cc b/src/lat/kaldi-lattice.cc index ee58e64704d..b44b12a5a23 100644 --- a/src/lat/kaldi-lattice.cc +++ b/src/lat/kaldi-lattice.cc @@ -75,7 +75,7 @@ bool WriteCompactLattice(std::ostream &os, bool binary, // on its own line. os << '\n'; bool acceptor = true, write_one = false; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); @@ -406,7 +406,7 @@ bool WriteLattice(std::ostream &os, bool binary, const Lattice &t) { // on its own line. 
os << '\n'; bool acceptor = false, write_one = false; -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); diff --git a/src/lat/push-lattice-test.cc b/src/lat/push-lattice-test.cc index e1f99bcb31f..ecd60501888 100644 --- a/src/lat/push-lattice-test.cc +++ b/src/lat/push-lattice-test.cc @@ -90,7 +90,7 @@ void TestPushCompactLatticeWeights() { } if (!ApproxEqual(sum, LatticeWeight::One())) { { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 fst::FstPrinter printer(clat2, NULL, NULL, NULL, true, true, "\t"); #else @@ -100,7 +100,7 @@ void TestPushCompactLatticeWeights() { printer.Print(&std::cerr, ""); } { -#ifdef HAVE_OPENFST_GE_10400 +#if OPENFST_VER >= 10400 fst::FstPrinter printer(*clat, NULL, NULL, NULL, true, true, "\t"); #else diff --git a/src/latbin/lattice-compose.cc b/src/latbin/lattice-compose.cc index 2d1415eede5..365be941a85 100644 --- a/src/latbin/lattice-compose.cc +++ b/src/latbin/lattice-compose.cc @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) { if (phi_label > 0) PropagateFinal(phi_label, fst2); -#ifdef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 fst::CacheOptions cache_opts(true, num_states_cache); fst::MapFstOptions mapfst_opts(cache_opts); #else diff --git a/src/latbin/lattice-lmrescore.cc b/src/latbin/lattice-lmrescore.cc index 10de27c43fc..d60d5fe93e5 100644 --- a/src/latbin/lattice-lmrescore.cc +++ b/src/latbin/lattice-lmrescore.cc @@ -74,7 +74,7 @@ int main(int argc, char *argv[]) { // mapped_fst is the LM fst interpreted using the LatticeWeight semiring, // with all the cost on the first member of the pair (since it's a graph // weight). -#ifdef HAVE_OPENFST_GE_10500 +#if OPENFST_VER >= 10500 fst::CacheOptions cache_opts(true, num_states_cache); fst::MapFstOptions mapfst_opts(cache_opts); #else From 28e89e5446e4e9d2f69dd4e6142317cb2bd69655 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Sat, 3 Dec 2016 22:08:52 -0800 Subject: [PATCH 070/213] Fix Minimize calls. --- src/kwsbin/kws-search.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/kwsbin/kws-search.cc b/src/kwsbin/kws-search.cc index 836f7b9a111..6f1a5d763d4 100644 --- a/src/kwsbin/kws-search.cc +++ b/src/kwsbin/kws-search.cc @@ -316,7 +316,11 @@ int main(int argc, char *argv[]) { } Project(&result_fst, PROJECT_OUTPUT); +#if OPENFST_VER >= 10500 + Minimize(&result_fst, (KwsLexicographicFst *) nullptr, kDelta, true); +#else Minimize(&result_fst); +#endif ShortestPath(result_fst, &result_fst, n_best); RmEpsilon(&result_fst); From c00caeca0eba131cfba87fbd625ee2678d69fd45 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Mon, 5 Dec 2016 14:49:40 -0800 Subject: [PATCH 071/213] Remove OpenFst related compiler flags in src/configure. --- src/configure | 279 +++++++++++--------------------------------------- 1 file changed, 59 insertions(+), 220 deletions(-) diff --git a/src/configure b/src/configure index c70fbc322d5..736689dc868 100755 --- a/src/configure +++ b/src/configure @@ -22,20 +22,6 @@ # ./configure --atlas-root=../tools/ATLAS/build # ./configure --use-cuda=no # disable CUDA detection (will build cpu-only # # version of kaldi even on CUDA-enabled machine -# # Cross compile for armv8hf, this assumes that you have openfst built -# # with the armv8-rpi3-linux-gnueabihf toolchain and installed to -# # /opt/cross/armv8hf. 
It also assumes that you have an ATLAS library -# # built for the target install to /opt/cross/armv8hf and that the -# # armv8-rpi3-linux-gnueabihf toolchain is available in your path -# ./configure --static --fst-root=/opt/cross/armv8hf --atlas-root=/opt/cross/armv8hf --host=armv8-rpi3-linux-gnueabihf -# # Cross compile for Android on arm -# # The only difference here is the addtion of the the --android-includes -# # flag because the toolchains produced by the Android NDK don't always -# # include the C++ stdlib headers in the normal cross compile include -# # path -# ./configure --static --openblas-root=/opt/cross/arm-linux-androideabi \ -# --fst-root=/opt/cross/arm-linux-androideabi --host=arm-linux-androideabi \ -# --fst-version=1.4.1 --android-includes=/opt/cross/arm-linux-androideabi/sysroot/usr/include #This should be incremented after every significant change of the configure script #I.e. after each change that affects the kaldi.mk or the build system as whole @@ -83,14 +69,12 @@ unset MKLROOT unset CLAPACKROOT unset OPENBLASROOT unset MKLLIBDIR -unset HOST function usage { echo 'Usage: ./configure [--static|--shared] [--threaded-atlas={yes|no}] [--atlas-root=ATLASROOT] [--fst-root=FSTROOT] [--openblas-root=OPENBLASROOOT] [--clapack-root=CLAPACKROOT] [--mkl-root=MKLROOT] [--mkl-libdir=MKLLIBDIR] [--omp-libdir=OMPDIR] [--static-fst={yes|no}] [--static-math={yes|no}] [--threaded-math={yes|no}] [--mathlib=ATLAS|MKL|CLAPACK|OPENBLAS] - [--use-cuda={yes|no}] [--cudatk-dir=CUDATKDIR][--mkl-threading=sequential|iomp|tbb|gomp] [--fst-version=VERSION] - [--host=HOST] [--android-includes=ANDROID_INC_DIR]'; + [--use-cuda={yes|no}] [--cudatk-dir=CUDATKDIR][--mkl-threading=sequential|iomp|tbb|gomp]'; } threaded_atlas=false # By default, use the un-threaded version of ATLAS. @@ -100,11 +84,6 @@ static_fst=false use_cuda=true dynamic_kaldi=false mkl_threading=sequential -# HOST and TARGET_ARCH are used when cross compiling, the user will specify HOST via the --host -# switch. TARGET_ARCH will be the first value in HOST if set, and `uname -m` otherwise -HOST="" -TARGET_ARCH="" -android=false cmd_line="$0 $@" # Save the command line to include in kaldi.mk @@ -205,47 +184,15 @@ do --cudatk-dir=*) CUDATKDIR=`read_dirname $1`; shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only - --fst-version=*) - OPENFST_VER=`expr "X$1" : '[^=]*=\(.*\)'`; - shift;; - --host=*) - # This expects the same format of host "triple" as autotools based projects - # this script will infer the target architecture from the specified triple. - HOST=`expr "X$1" : '[^=]*=\(.*\)'`; - shift ;; - --android-includes=*) - threaded_math=false; - static_math=true; - static_fst=true; - dynamic_kaldi=false; - MATHLIB='OPENBLAS'; - android=true; - ANDROIDINC=`read_dirname $1`; - shift;; *) echo "Unknown argument: $1, exiting"; usage; exit 1 ;; esac done - # the idea here is that if you change the configuration options from using # CUDA to not using it, or vice versa, we want to recompile all parts of the # code that may use a GPU. Touching this file is a way to force this. touch cudamatrix/cu-common.h 2>/dev/null - -function add_cross_tools { - # If the $HOST variable is set, we need to tell make to use the specified tools - if [ ! 
-z "$HOST" ]; then - echo '# A host triple was specified, we need to prepend all the tools with it' >> kaldi.mk - echo "HOST = $HOST" >> kaldi.mk - echo 'CC := $(HOST)-$(CC)' >> kaldi.mk - echo 'CXX := $(HOST)-$(CXX)' >> kaldi.mk - echo 'AR := $(HOST)-$(AR)' >> kaldi.mk - echo 'AS := $(HOST)-$(AS)' >> kaldi.mk - echo 'RANLIB := $(HOST)-$(RANLIB)' >> kaldi.mk - fi -} - function failure { echo "***configure failed: $* ***" >&2 if [ -f kaldi.mk ]; then rm kaldi.mk; fi @@ -257,13 +204,8 @@ function check_exists { } function check_for_bad_gcc { - if [ -z "$HOST" ] ; then - compiler="gcc" - else - compiler="$HOST-gcc" - fi - if which $compiler >&/dev/null; then # gcc is on the path - gcc_version=$($compiler -dumpspecs 2>&1 | grep -A1 -F '*version:' | grep -v version) + if which gcc >&/dev/null; then # gcc is on the path + gcc_version=$(gcc -dumpspecs 2>&1 | grep -A1 -F '*version:' | grep -v version) if [ "$gcc_version" == "4.8.2" ] || [ "$gcc_version" == "4.8.1" ]; then echo "*** WARNING: your version of gcc seems to be 4.8.1 or 4.8.2. ***" echo "*** These versions of gcc has a bug in nth_element ***" @@ -276,19 +218,16 @@ function check_for_bad_gcc { } function check_for_slow_expf { - # We cannot run this test if we are cross compiling. - if [[ "$TARGET_ARCH" == "`uname -m`" ]] ; then - cd probe - rm -f exp-test - make -f Makefile.slow_expf 1>/dev/null - ./exp-test - if [ $? -eq 1 ]; then - echo "*** WARNING: expf() seems to be slower than exp() on your machine. This is a known bug in old versions of glibc. Please consider updating glibc. ***" - echo "*** Kaldi will be configured to use exp() instead of expf() in base/kaldi-math.h Exp() routine for single-precision floats. ***" - echo "CXXFLAGS += -DKALDI_NO_EXPF" >> ../kaldi.mk - fi - cd .. + cd probe + rm -f exp-test + make -f Makefile.slow_expf 1>/dev/null + ./exp-test + if [ $? -eq 1 ]; then + echo "*** WARNING: expf() seems to be slower than exp() on your machine. This is a known bug in old versions of glibc. Please consider updating glibc. ***" + echo "*** Kaldi will be configured to use exp() instead of expf() in base/kaldi-math.h Exp() routine for single-precision floats. ***" + echo "CXXFLAGS += -DKALDI_NO_EXPF" >> ../kaldi.mk fi + cd .. } @@ -312,6 +251,7 @@ function check_library { } + #Check if at least one of these variables is set #If yes, we want to switch to using the MKL is_set $MKLLIBDIR && echo "Force-configuring KALDI to use MKL" && export MATHLIB="MKL" @@ -319,32 +259,6 @@ is_set $MKLROOT && echo "Force-configuring KALDI to use MKL"&& export MATHLIB="M is_set $CLAPACKROOT && echo "Force-configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" is_set $OPENBLASROOT && echo "Force-configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" - -# If HOST is specified, parse the TARGET_ARCH, otherwise use uname -m -if [[ "$HOST" == "" ]] ; then - TARGET_ARCH="`uname -m`" -else - # The HOST value will be something like "armv8-rpi3-linux-gnueabihf" and we need the first value - # as delimited by '-' to be used as the TARGET_ARCH for this build. The read command is the - # bash equivalent of split() found in other scripting languages. read uses the value in - # environment variable IFS as the field delimiter. 
The following command will take the - # host string "armv8-rpi3-linux-gnueabihf" as streamed in from the HOST variable - # and return ["armv8", "rpi3", "linux", "gnueabihf"] in PARTS - # - # Note that by changing the value of IFS (which is an environment variable) on the same - # line as the read invocation, it is only changed for that invocation and not for the shell - # executing this script. So we do not need to cache and reset the value. - IFS='-' read -ra PARTS <<< "$HOST" - # We only want the first entry from the list as the architecture - TARGET_ARCH="$PARTS" - if [[ "$TARGET_ARCH" != arm* && "$TARGET_ARCH" != ppc64le && "$TARGET_ARCH" != x86* ]] ; then - # We currently only support building for x86[_64], arm*, and ppc64le, if the - # TARGET_ARCH was read from the HOST variable, it must be one of these - failure "$TARGET_ARCH is an unsupported architecture, kaldi currently supports x86[_64], arm*, and ppc64le" - fi -fi - - #MKL functions function linux_configure_mkllibdir { local mklroot=$1 @@ -519,11 +433,6 @@ function configure_cuda { if [ ! -f $CUDATKDIR/bin/nvcc ]; then failure "Cannnot find nvcc in CUDATKDIR=$CUDATKDIR" fi - - if [[ "$TARGET_ARCH" != "`uname -m`" ]] ; then - failure "Cannot cross compile with CUDA support" - fi - echo "Using CUDA toolkit $CUDATKDIR (nvcc compiler and runtime libraries)" echo >> kaldi.mk echo "#Next section enables CUDA for compilation" >> kaldi.mk @@ -546,7 +455,7 @@ function configure_cuda { esac echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk - # 64bit/32bit? We do not support cross compilation with CUDA so, use direct calls to uname -m here + # 64bit/32bit? if [ "`uname -m`" == "x86_64" ]; then if [ "`uname`" == "Darwin" ]; then sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk @@ -605,13 +514,8 @@ function linux_configure_speex { } function fix_cxx_flag { - USINGGXX=`grep -c "CXX = g++" kaldi.mk` - if [ $USINGGXX -ge 1 ]; then - if [ -z "$HOST" ] ; then - CXXCOMPILER="g++" - else - CXXCOMPILER="$HOST-g++" - fi + CXXCOMPILER=`grep "CXX = " kaldi.mk | awk '{print $3}'` + if [ $CXXCOMPILER=="g++" ]; then $CXXCOMPILER -dumpversion | \ awk '{if(NR==1 && $1<"4.4") print "sed \"s/-Wno-unused-local-typedefs//g\" \ kaldi.mk > tmpf; mv tmpf kaldi.mk; "}' | sh - @@ -622,9 +526,9 @@ function linux_atlas_failure { # function we use when we couldn't find # ATLAS libs. echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = [somewhere]/liblapack.a [somewhere]/libcblas.a [somewhere]/libatlas.a [somewhere]/libf77blas.a $ATLASLIBDIR >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then + elif [[ "`uname -m`" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk @@ -654,12 +558,7 @@ function linux_check_static { if [ -f $dir/libatlas.a ]; then # candidate... # Note: on the next line, the variable assignment # LANG=en_US should apply just to the program called on that line. 
- if [ -z "$HOST" ] ; then - compiler="gcc" - else - compiler="$HOST-gcc" - fi - if LANG=en_US $compiler -o test_linking test_linking.cc -u ATL_flushcache $dir/libatlas.a 2>&1 | grep -i "incompatible" >/dev/null; then + if LANG=en_US gcc -o test_linking test_linking.cc -u ATL_flushcache $dir/libatlas.a 2>&1 | grep -i "incompatible" >/dev/null; then echo "Directory $dir may contain ATLAS libraries but seems to be wrong architecture"; rm test_linking test_linking.cc 2>/dev/null return 1; @@ -684,15 +583,14 @@ function linux_configure_debian_ubuntu { fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then + elif [[ "`uname -m`" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi fix_cxx_flag - add_cross_tools echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda linux_configure_speex @@ -710,15 +608,14 @@ function linux_configure_debian_ubuntu3 { fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then + elif [[ "`uname -m`" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi fix_cxx_flag - add_cross_tools echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda linux_configure_speex @@ -739,15 +636,14 @@ function linux_configure_debian7 { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk echo - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then + elif [[ "`uname -m`" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi fix_cxx_flag - add_cross_tools echo "Successfully configured for Debian 7 [dynamic libraries] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda linux_configure_speex @@ -765,15 +661,14 @@ function linux_configure_redhat { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk echo - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then + elif [[ "`uname -m`" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi fix_cxx_flag - add_cross_tools echo "Successfully configured for red hat [dynamic libraries] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda exit_success; @@ -793,15 +688,14 @@ function linux_configure_redhat_fat { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk echo - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then + elif [[ "`uname -m`" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi 
fix_cxx_flag - add_cross_tools echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda exit_success; @@ -853,15 +747,14 @@ function linux_configure_static { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then + elif [[ "`uname -m`" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi fix_cxx_flag - add_cross_tools $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS" @@ -939,15 +832,14 @@ function linux_configure_dynamic { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then + elif [[ "`uname -m`" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi fix_cxx_flag - add_cross_tools $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" @@ -957,7 +849,7 @@ function linux_configure_dynamic { echo "Configuring ..." if [ ! -f makefiles/common.mk ]; then - failure makefiles/common.mk not found + failure makefiles/common.mk not found fi # back up the old one in case we modified it @@ -969,77 +861,26 @@ fi printf "# This file was generated using the following command:\n# $cmd_line\n\n" > kaldi.mk cat makefiles/common.mk >> kaldi.mk if $dynamic_kaldi ; then -KALDILIBDIR=`pwd`/lib -echo "KALDI_FLAVOR := dynamic" >> kaldi.mk -echo "KALDILIBDIR := $KALDILIBDIR" >> kaldi.mk + KALDILIBDIR=`pwd`/lib + echo "KALDI_FLAVOR := dynamic" >> kaldi.mk + echo "KALDILIBDIR := $KALDILIBDIR" >> kaldi.mk fi echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk echo "FSTROOT = $FSTROOT" >> kaldi.mk echo "Checking OpenFST library in $FSTROOT ..." if [ ! -f $FSTROOT/include/fst/fst.h ]; then - failure "Could not find file $FSTROOT/include/fst/fst.h: - you may not have installed OpenFst. See ../tools/INSTALL" + failure "Could not find file $FSTROOT/include/fst/fst.h: + you may not have installed OpenFst. See ../tools/INSTALL" fi OPENFST_VER="${OPENFST_VER:-`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'`}" -echo "Adding flags necessary for compiling against OpenFst-$OPENFST_VER ..." echo "OPENFST_VER = $OPENFST_VER" >> kaldi.mk -OPENFST_VER_NUM=`echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d"` -echo "EXTRA_CXXFLAGS += -DOPENFST_VER=$OPENFST_VER_NUM" >> kaldi.mk -if [ $OPENFST_VER_NUM -ge 10400 ]; then - echo "EXTRA_CXXFLAGS += -std=c++0x" >> kaldi.mk -fi - -if [ $OPENFST_VER_NUM -lt 10500 ]; then - echo "Checking if OpenFst library was patched ..." - if ! grep "multiple repeated" $FSTROOT/include/fst/minimize.h >/dev/null; then - echo "** ERROR **" - echo "** $FSTROOT/include/fst/minimize.h seems not to be patched:" - echo "patch not applied? FST tools will not work in our recipe." - exit 1; - fi -fi +echo "CXXFLAGS += -std=c++0x" >> kaldi.mk # Most of the OS-specific steps below will append to kaldi.mk echo "Doing OS specific configurations ..." 
-if $android ; then - OPENFSTLIBS="$FSTROOT/lib/libfst.a" - echo "OPENFSTLIBS = $OPENFSTLIBS" >> kaldi.mk - - if [ -z $ANDROIDINC ] ; then - failure "--android-includes must be specified for android builds" - fi - - if [ -z $HOST ] ; then - failure "HOST must be specified for android builds" - fi - - OPENBLASROOT=`rel2abs "$OPENBLASROOT"` - if [ -z "$OPENBLASROOT" ]; then - failure "Must specify the location of OPENBLAS with --openblas-root option (and it must exist)" - fi - if [ ! -f $OPENBLASROOT/lib/libopenblas.a ]; then - failure "Expected to find the file $OPENBLASROOT/lib/libopenblas.a" - fi - echo "Your math library seems to be OpenBLAS. Configuring appropriately." - - OPENBLASLIBS="$OPENBLASROOT/lib/libopenblas.a $OPENBLASROOT/lib/libclapack.a $OPENBLASROOT/lib/liblapack.a $OPENBLASROOT/lib/libblas.a $OPENBLASROOT/lib/libf2c.a" - echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk - echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk - echo "ANDROIDINC = $ANDROIDINC" >> kaldi.mk - - cat makefiles/android_openblas.mk >> kaldi.mk - - add_cross_tools - - echo "Successfully configured OpenBLAS from $OPENBLASROOT." - echo "Configuration succeeded for platform Android" - exit_success -fi - - # Check for Darwin at first, because we later call uname -o (for Cygwin) # which crashes on Darwin. Also the linear algebra libraries on Macs are # used differently (through the Accelerate framework) than on Linux. @@ -1095,26 +936,26 @@ if [ "`uname`" == "Darwin" ]; then fi if [ "`uname -o`" == "Cygwin" ]; then - echo "On Cygwin: checking for linear algebra libraries ..." - if [ ! -f ../tools/CLAPACK/clapack.h ]; then - failure "could not find file ../tools/CLAPACK/clapack.h" - fi - if [ ! -f /usr/lib/lapack/cygblas-0.dll ]; then - failure "please first install package liblapack0" - fi - cat makefiles/cygwin.mk >> kaldi.mk - echo "Configuration succeeded for platform cygwin" - exit_success; + echo "On Cygwin: checking for linear algebra libraries ..." + if [ ! -f ../tools/CLAPACK/clapack.h ]; then + failure "could not find file ../tools/CLAPACK/clapack.h" + fi + if [ ! -f /usr/lib/lapack/cygblas-0.dll ]; then + failure "please first install package liblapack0" + fi + cat makefiles/cygwin.mk >> kaldi.mk + echo "Configuration succeeded for platform cygwin" + exit_success; fi if [ "`uname`" == "Linux" ]; then if $static_fst ; then - OPENFSTLIBS="$FSTROOT/lib/libfst.a" - fst_type='a' + OPENFSTLIBS="$FSTROOT/lib/libfst.a" + fst_type='a' else - OPENFSTLIBS="-L${FSTROOT}/lib -lfst" - OPENFSTLDFLAGS="-Wl,-rpath=${FSTROOT}/lib" - fst_type='so' + OPENFSTLIBS="-L${FSTROOT}/lib -lfst" + OPENFSTLDFLAGS="-Wl,-rpath=${FSTROOT}/lib" + fst_type='so' fi if [ ! -f "$FSTROOT/lib/libfst.${fst_type}" ]; then failure "Static=[$static_fst] OpenFST library not found: See ../tools/INSTALL" @@ -1169,7 +1010,7 @@ if [ "`uname`" == "Linux" ]; then fi elif [ "$MATHLIB" == "MKL" ]; then - if [ "$TARGET_ARCH" != "x86_64" ]; then + if [ "`uname -m`" != "x86_64" ]; then failure "MKL on Linux only supported for Intel(R) 64 architecture (x86_64). See makefiles/linux_64_mkl.mk to manually configure for other platforms." fi @@ -1235,13 +1076,12 @@ if [ "`uname`" == "Linux" ]; then if [ ! -f makefiles/linux_clapack.mk ]; then failure "makefiles/linux_clapack.mk not found." 
fi - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_clapack_arm.mk >> kaldi.mk else cat makefiles/linux_clapack.mk >> kaldi.mk fi fix_cxx_flag - add_cross_tools echo "Warning (CLAPACK): this part of the configure process is not properly tested and will not work." $use_cuda && configure_cuda linux_configure_speex @@ -1265,15 +1105,14 @@ if [ "`uname`" == "Linux" ]; then fi echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk - if [[ "$TARGET_ARCH" == arm* ]]; then + if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_openblas_arm.mk >> kaldi.mk - elif [[ "$TARGET_ARCH" == ppc64le ]]; then + elif [[ "`uname -m`" == ppc64le ]]; then cat makefiles/linux_openblas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_openblas.mk >> kaldi.mk fi fix_cxx_flag - add_cross_tools $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured OpenBLAS from $OPENBLASROOT." From 3bf3b1559db73d71338a44a8650c411a94ddf39d Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Mon, 5 Dec 2016 14:53:14 -0800 Subject: [PATCH 072/213] Update src/Makefile to enforce OpenFst >= 1.5.3. OpenFst-1.5.3 adds support for minimization of non-deterministic FSTs over idempotent semirings which is a feature used throughout Kaldi. Along with the requirement for a C++ compiler with C++11 support, we are also removing support for older OpenFst releases so that we can build against an un-patched OpenFst installation. --- src/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index 8bc18b254e9..cecc8ca5170 100644 --- a/src/Makefile +++ b/src/Makefile @@ -97,8 +97,8 @@ endif # Note: OPENFST_VER is determined by configure and added to kaldi.mk OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") test_dependencies: -ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10302)","1") - $(error OpenFst $(OPENFST_VER) is not supported. You now need OpenFst >= 1.3.2.) +ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10503)","1") + $(error OpenFst $(OPENFST_VER) is not supported. You now need OpenFst >= 1.5.3.) endif check_portaudio: @@ -184,4 +184,3 @@ onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder online: decoder gmm transform feat matrix util base lat hmm thread tree online2: decoder gmm transform feat matrix util base lat hmm thread tree ivector cudamatrix nnet2 nnet3 chain kws: base util thread hmm tree matrix lat - From 091da1a02b0ddd1cfbd833b5445cb13776973eb2 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Mon, 5 Dec 2016 17:06:18 -0800 Subject: [PATCH 073/213] Remove code for supporting OpenFst < 1.5.3. This commit removes all code that has to do with supporting old OpenFst releases. It also makes some updates to TableMatcher, ContextFst and TrivialFactorWeight. 
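Most of the deleted lines follow one repeated pattern: an "#if OPENFST_VER >= 10400 / #else / #endif" guard around calls whose signatures changed in newer OpenFst (for example the FstPrinter constructor, which gained a field-separator argument in 1.4). Below is a minimal, self-contained sketch of the single form that remains once OpenFst >= 1.5.3 is assumed; it is an illustration only, the names Arc, fst, isyms and osyms are placeholders, and the include paths are assumptions rather than an excerpt from this patch.

    #include <iostream>
    #include <fst/fstlib.h>
    #include <fst/script/print-impl.h>  // assumed header providing fst::FstPrinter

    // Print an FST in text form, as the test code in this patch does.
    // Previously every such call site carried an OPENFST_VER guard to add or
    // drop the "\t" field-separator argument; with OpenFst >= 1.5.3 required,
    // only the 7-argument constructor is used.
    template <class Arc>
    void PrintFst(const fst::Fst<Arc> &fst,
                  const fst::SymbolTable *isyms,
                  const fst::SymbolTable *osyms) {
      fst::FstPrinter<Arc> printer(fst, isyms, osyms, NULL,
                                   false /* acceptor */,
                                   true  /* show weight one */,
                                   "\t"  /* field separator */);
      printer.Print(&std::cout, "standard output");
    }
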
--- src/bin/phones-to-prons.cc | 6 - src/fstext/context-fst-inl.h | 49 ++------ src/fstext/context-fst-test.cc | 13 -- src/fstext/context-fst.h | 139 +++++---------------- src/fstext/determinize-lattice-test.cc | 36 +----- src/fstext/determinize-star-test.cc | 103 +-------------- src/fstext/factor-test.cc | 8 -- src/fstext/fstext-utils-test.cc | 14 --- src/fstext/kaldi-fst-io-inl.h | 5 - src/fstext/lattice-utils-inl.h | 4 - src/fstext/lattice-utils-test.cc | 38 ++---- src/fstext/pre-determinize-test.cc | 38 +----- src/fstext/prune-special-test.cc | 12 -- src/fstext/push-special-test.cc | 12 +- src/fstext/remove-eps-local-test.cc | 19 +-- src/fstext/table-matcher-test.cc | 36 +----- src/fstext/table-matcher.h | 59 ++------- src/fstext/trivial-factor-weight-test.cc | 47 +------ src/fstext/trivial-factor-weight.h | 62 ++------- src/kwsbin/kws-search.cc | 4 - src/lat/determinize-lattice-pruned-test.cc | 36 +----- src/lat/kaldi-lattice.cc | 20 +-- src/lat/push-lattice-test.cc | 10 -- src/latbin/lattice-compose.cc | 4 - src/latbin/lattice-lmrescore.cc | 4 - 25 files changed, 104 insertions(+), 674 deletions(-) diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 33a821ce6ab..6e3cf7a4651 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -170,11 +170,7 @@ int main(int argc, char *argv[]) { << "not reach end-state, or mismatched lexicon.)"; if (g_kaldi_verbose_level >= 2) { KALDI_LOG << "phn2word FST is below:"; -#if OPENFST_VER >= 10400 fst::FstPrinter fstprinter(phn2word, NULL, NULL, NULL, false, true, "\t"); -#else - fst::FstPrinter fstprinter(phn2word, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cerr, "standard error"); KALDI_LOG << "phone sequence is: "; for (size_t i = 0; i < phones.size(); i++) @@ -219,5 +215,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/fstext/context-fst-inl.h b/src/fstext/context-fst-inl.h index 204c8b92c1f..4427863d887 100644 --- a/src/fstext/context-fst-inl.h +++ b/src/fstext/context-fst-inl.h @@ -41,13 +41,6 @@ typename ContextFstImpl::StateId VectorToStateIter iter = state_map_.find(seq); if (iter == state_map_.end()) { // Not already in map. StateId this_state_id = (StateId)state_seqs_.size(); - //This check is not needed with OpenFst >= 1.4 -#if OPENFST_VER >= 10400 -#else - StateId this_state_id_check = CacheImpl::AddState(); - // goes back to VectorFstBaseImpl, inherited via CacheFst - assert(this_state_id == this_state_id_check); -#endif state_seqs_.push_back(seq); state_map_[seq] = this_state_id; return this_state_id; @@ -326,60 +319,34 @@ void ContextFstImpl::Expand(StateId s) { // expands arcs only [not // We just try adding all possible symbols on the output side. Arc arc; if (this->CreateArc(s, subsequential_symbol_, &arc)) { -#if OPENFST_VER >= 10400 this->PushArc(s, arc); -#else - this->AddArc(s, arc); -#endif } for (typename kaldi::ConstIntegerSet > : public CacheStateIterator< ContextFst > { public: explicit StateIterator(const ContextFst &fst) - : CacheStateIterator< ContextFst >(fst, fst.GetImpl()) {} + : CacheStateIterator< ContextFst >(fst, fst.GetMutableImpl()) {} }; @@ -369,13 +301,10 @@ class ArcIterator< ContextFst > typedef typename A::StateId StateId; ArcIterator(const ContextFst &fst, StateId s) - : CacheArcIterator< ContextFst >(fst.GetImpl(), s) { + : CacheArcIterator< ContextFst >(fst.GetMutableImpl(), s) { if (!fst.GetImpl()->HasArcs(s)) // arcs not already computed. 
- fst.GetImpl()->Expand(s); + fst.GetMutableImpl()->Expand(s); } - - private: - DISALLOW_COPY_AND_ASSIGN(ArcIterator); }; template inline diff --git a/src/fstext/determinize-lattice-test.cc b/src/fstext/determinize-lattice-test.cc index 42122c6e193..7359fa1354d 100644 --- a/src/fstext/determinize-lattice-test.cc +++ b/src/fstext/determinize-lattice-test.cc @@ -75,7 +75,7 @@ template void TestDeterminizeLattice() { typedef typename Arc::Weight Weight; typedef int32 Int; typedef ArcTpl > CompactArc; - + for(int i = 0; i < 100; i++) { RandFstOptions opts; opts.n_states = 4; @@ -84,34 +84,26 @@ template void TestDeterminizeLattice() { opts.allow_empty = false; opts.weight_multiplier = 0.5; // impt for the randomly generated weights // to be exactly representable in float, - // or this test fails because numerical differences can cause symmetry in + // or this test fails because numerical differences can cause symmetry in // weights to be broken, which causes the wrong path to be chosen as far // as the string part is concerned. - + VectorFst *fst = RandFst(); std::cout << "FST before lattice-determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst det_fst; try { DeterminizeLatticeOptions lat_opts; lat_opts.max_mem = 100; - + if (!DeterminizeLattice(*fst, &det_fst, lat_opts, NULL)) throw std::runtime_error("could not determinize"); std::cout << "FST after lattice-determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } assert(det_fst.Properties(kIDeterministic, true) & kIDeterministic); @@ -122,11 +114,7 @@ template void TestDeterminizeLattice() { ConvertLattice(*fst, &compact_fst, false); std::cout << "Compact FST is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(compact_fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(compact_fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } if (kaldi::Rand() % 2 == 1) @@ -134,17 +122,13 @@ template void TestDeterminizeLattice() { else if (!DeterminizeLattice(*fst, &compact_det_fst, lat_opts, NULL)) throw std::runtime_error("could not determinize"); - + std::cout << "Compact version of determinized FST is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(compact_det_fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(compact_det_fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } - + assert(RandEquivalent(compact_det_fst, compact_fst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); } catch (...) 
{ std::cout << "Failed to lattice-determinize this FST (probably not determinizable)\n"; @@ -162,22 +146,14 @@ template void TestDeterminizeLattice2() { VectorFst *fst = RandFst(opts); std::cout << "FST before lattice-determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst ofst; DeterminizeLattice(*fst, &ofst); std::cout << "FST after lattice-determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } delete fst; diff --git a/src/fstext/determinize-star-test.cc b/src/fstext/determinize-star-test.cc index f308d8460d8..ee150f0c024 100644 --- a/src/fstext/determinize-star-test.cc +++ b/src/fstext/determinize-star-test.cc @@ -37,11 +37,7 @@ template void TestDeterminizeGeneral() { VectorFst *fst = RandFst(); std::cout << "FST before determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst ofst; @@ -49,14 +45,10 @@ template void TestDeterminizeGeneral() { DeterminizeStar >(*fst, &ofst, kDelta, NULL, max_states); std::cout << "FST after determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } - assert(RandEquivalent(*fst, ofst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); + assert(RandEquivalent(*fst, ofst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); } catch (...) { std::cout << "Failed to determinize *this FST (probably not determinizable)\n"; } @@ -108,11 +100,7 @@ template void TestDeterminize() { std::cout <<" printing before trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } // Trim resulting FST. 
@@ -120,11 +108,7 @@ template void TestDeterminize() { std::cout <<" printing after trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -137,11 +121,7 @@ template void TestDeterminize() { std::cout <<" printing after predeterminization\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -157,11 +137,7 @@ template void TestDeterminize() { std::cout <<" printing after epsilon removal\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst ofst_orig; @@ -180,22 +156,14 @@ template void TestDeterminize() { { std::cout <<" printing after determinization [baseline]\n"; -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); assert(ofst_orig.Properties(kIDeterministic, true) == kIDeterministic); } { std::cout <<" printing after determinization [star]\n"; -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); assert(ofst_star.Properties(kIDeterministic, true) == kIDeterministic); } @@ -205,11 +173,7 @@ template void TestDeterminize() { int64 num_removed = DeleteISymbols(&ofst_star, extra_syms); std::cout <<" printing after removing "<= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -277,11 +241,7 @@ template void TestPush() { std::cout <<" printing before trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } // Trim resulting FST. @@ -289,11 +249,7 @@ template void TestPush() { std::cout <<" printing after trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -310,11 +266,7 @@ template void TestPush() { std::cout <<" printing after pushing\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst_pushed, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(fst_pushed, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -367,11 +319,7 @@ template void TestMinimize() { std::cout <<" printing before trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } // Trim resulting FST. 
@@ -379,11 +327,7 @@ template void TestMinimize() { std::cout <<" printing after trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -396,11 +340,7 @@ template void TestMinimize() { std::cout <<" printing after predeterminization\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -416,11 +356,7 @@ template void TestMinimize() { std::cout <<" printing after epsilon removal\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst ofst_orig; @@ -433,11 +369,7 @@ template void TestMinimize() { } { std::cout <<" printing after determinization [baseline]\n"; -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -449,11 +381,7 @@ template void TestMinimize() { DeterminizeStar(*fst, &gallic_fst); { std::cout <<" printing after determinization by DeterminizeStar [in gallic]\n"; -#if OPENFST_VER >= 10400 FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -463,11 +391,7 @@ template void TestMinimize() { { std::cout <<" printing after pushing weights [in gallic]\n"; -#if OPENFST_VER >= 10400 FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -476,45 +400,24 @@ template void TestMinimize() { Minimize(&gallic_fst); { std::cout <<" printing after minimization [in gallic]\n"; -#if OPENFST_VER >= 10400 FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } printf("Converting gallic back to regular [my approach]\n"); -#if OPENFST_VER >= 10400 TrivialFactorWeightFst< GallicArc, GallicFactor > fwfst(gallic_fst); -#else - TrivialFactorWeightFst< GallicArc, GallicFactor > fwfst(gallic_fst); -#endif { std::cout <<" printing factor-weight FST\n"; -#if OPENFST_VER >= 10400 FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } -#if OPENFST_VER >= 10400 Map(fwfst, &ofst_star, FromGallicMapper()); -#else - Map(fwfst, &ofst_star, FromGallicMapper()); -#endif { std::cout <<" printing after converting back to regular FST\n"; -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -527,11 +430,7 @@ template void TestMinimize() { int64 num_removed = DeleteISymbols(&ofst_star, extra_syms); std::cout <<" printing after removing 
"<= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } diff --git a/src/fstext/factor-test.cc b/src/fstext/factor-test.cc index 1d446796b05..cb021ab4643 100644 --- a/src/fstext/factor-test.cc +++ b/src/fstext/factor-test.cc @@ -78,11 +78,7 @@ template static void TestFactor() { std::cout <<" printing before trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } // Trim resulting FST. @@ -90,11 +86,7 @@ template static void TestFactor() { std::cout <<" printing after trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc index 494935d3622..2802a84cca6 100644 --- a/src/fstext/fstext-utils-test.cc +++ b/src/fstext/fstext-utils-test.cc @@ -146,11 +146,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing before trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } // Trim resulting FST. @@ -158,11 +154,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing after trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -376,11 +368,7 @@ void TestEqualAlign() { template void Print(const Fst &fst, std::string message) { std::cout << message << "\n"; -#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -451,5 +439,3 @@ int main() { fst::TestRemoveUselessArcs(); } } - - diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index 58895449c72..b6bae4b9dc9 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -42,13 +42,8 @@ void WriteFstKaldi(std::ostream &os, bool binary, // appear on its own line. 
os << '\n'; bool acceptor = false, write_one = false; -#if OPENFST_VER >= 10400 FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); -#else - FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), - NULL, acceptor, write_one); -#endif printer.Print(&os, ""); if (os.fail()) KALDI_ERR << "Stream failure detected writing FST to stream"; diff --git a/src/fstext/lattice-utils-inl.h b/src/fstext/lattice-utils-inl.h index f15e8d2cc57..5bb40e3efa3 100644 --- a/src/fstext/lattice-utils-inl.h +++ b/src/fstext/lattice-utils-inl.h @@ -267,12 +267,8 @@ void ConvertFstToLattice( const ExpandedFst > &ifst, MutableFst > > *ofst) { int32 num_states_cache = 50000; -#if OPENFST_VER >= 10500 fst::CacheOptions cache_opts(true, num_states_cache); fst::MapFstOptions mapfst_opts(cache_opts); -#else - fst::CacheOptions mapfst_opts(true, num_states_cache); -#endif StdToLatticeMapper mapper; MapFst >, StdToLatticeMapper > map_fst(ifst, mapper, mapfst_opts); diff --git a/src/fstext/lattice-utils-test.cc b/src/fstext/lattice-utils-test.cc index 51df0ce8364..e74caef4aa2 100644 --- a/src/fstext/lattice-utils-test.cc +++ b/src/fstext/lattice-utils-test.cc @@ -30,11 +30,7 @@ template void TestConvert(bool invert) { VectorFst *fst = RandFst(); std::cout << "FST before converting to compact-arc is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst ofst; @@ -42,25 +38,17 @@ template void TestConvert(bool invert) { std::cout << "FST after converting is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst origfst; ConvertLattice(ofst, &origfst, invert); std::cout << "FST after back conversion is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } - + assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); delete fst; } @@ -78,11 +66,7 @@ template void TestShortestPath() { std::cout << "Testing shortest path\n"; std::cout << "FST before converting to compact-arc is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst cfst; @@ -96,8 +80,8 @@ template void TestShortestPath() { ShortestPath(*fst, &nbest_fst_2, 3); VectorFst nbest_fst_1b; ShortestPath(nbest_fst_2, &nbest_fst_1b, 1); - - + + assert(ApproxEqual(ShortestDistance(nbest_fst_1), ShortestDistance(nbest_fst_1b))); @@ -112,7 +96,7 @@ template void TestShortestPath() { ShortestPath(cfst, &nbest_fst_2, 3); VectorFst nbest_fst_1b; ShortestPath(nbest_fst_2, &nbest_fst_1b, 1); - + assert(ApproxEqual(ShortestDistance(nbest_fst_1), ShortestDistance(nbest_fst_1b))); // since semiring is idempotent, this should succeed too. 
@@ -122,7 +106,7 @@ template void TestShortestPath() { delete fst; } - } + } } @@ -132,7 +116,7 @@ template void TestConvert2() { typedef ArcTpl > ArcD; typedef ArcTpl, Int> > CArcF; typedef ArcTpl, Int> > CArcD; - + for(int i = 0; i < 2; i++) { { VectorFst *fst1 = RandPairFst(); @@ -197,7 +181,7 @@ template void TestConvert2() { assert(RandEquivalent(*fst1, fst2, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); delete fst1; } - + { VectorFst *fst1 = RandPairFst(); VectorFst cfst1; @@ -209,7 +193,7 @@ template void TestConvert2() { } } } - + // use TestConvertPair when the Weight can be constructed from // a pair of floats. @@ -239,7 +223,7 @@ template void TestConvertPair(bool invert) { fstprinter.Print(&std::cout, "standard output"); }*/ - assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); + assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); delete fst; } } @@ -268,7 +252,7 @@ template void TestScalePair(bool invert) { scale2[1][0] = -0.25; } - + typedef ArcTpl Arc; typedef ArcTpl > CompactArc; for(int i = 0; i < 2; i++) { @@ -331,7 +315,7 @@ int main() { } { typedef LatticeWeightTpl LatticeWeight; - TestShortestPath(); + TestShortestPath(); TestConvert2(); for(int i = 0; i < 2; i++) { bool invert = (i % 2); diff --git a/src/fstext/pre-determinize-test.cc b/src/fstext/pre-determinize-test.cc index 774507b0792..bea8120e0e5 100644 --- a/src/fstext/pre-determinize-test.cc +++ b/src/fstext/pre-determinize-test.cc @@ -36,12 +36,12 @@ template void TestPreDeterminize() { int n_syms = 2 + kaldi::Rand() % 5, n_states = 3 + kaldi::Rand() % 10, n_arcs = 5 + kaldi::Rand() % 30, n_final = 1 + kaldi::Rand()%3; // Up to 2 unique symbols. cout << "Testing pre-determinize with "< all_syms; // including epsilon. // Put symbols in the symbol table from 1..n_syms-1. for (size_t i = 0;i < (size_t)n_syms;i++) all_syms.push_back(i); - + // Create states. vector all_states; for (size_t i = 0;i < (size_t)n_states;i++) { @@ -69,11 +69,7 @@ template void TestPreDeterminize() { std::cout <<" printing before trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } // Trim resulting FST. 
@@ -81,11 +77,7 @@ template void TestPreDeterminize() { std::cout <<" printing after trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -99,11 +91,7 @@ template void TestPreDeterminize() { std::cout <<" printing after predeterminization\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -119,11 +107,7 @@ template void TestPreDeterminize() { std::cout <<" printing after epsilon removal\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -133,22 +117,14 @@ template void TestPreDeterminize() { Determinize(*fst, &ofst, opts); std::cout <<" printing after determinization\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } int64 num_removed = DeleteISymbols(&ofst, extra_syms); std::cout <<" printing after removing "<= 10400 FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -200,11 +176,7 @@ template void TestAddSelfLoops() { } std::cout <<" printing before adding self-loops\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -223,11 +195,7 @@ template void TestAddSelfLoops() { std::cout <<" printing after adding self-loops\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -247,5 +215,3 @@ int main() { fst::TestAddSelfLoops(); } } - - diff --git a/src/fstext/prune-special-test.cc b/src/fstext/prune-special-test.cc index cb55edca6cc..2da002d980e 100644 --- a/src/fstext/prune-special-test.cc +++ b/src/fstext/prune-special-test.cc @@ -37,11 +37,7 @@ static void TestPruneSpecial() { float beam = 0.55; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*ifst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*ifst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); std::cout << endl; } @@ -50,11 +46,7 @@ static void TestPruneSpecial() { VectorFst ofst1; PruneSpecial(*ifst, &ofst1, beam); { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst1, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst1, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); std::cout << endl; } @@ -63,11 +55,7 @@ static void TestPruneSpecial() { VectorFst ofst2; Prune(*ifst, &ofst2, beam); { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst2, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst2, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, 
"standard output"); std::cout << endl; } diff --git a/src/fstext/push-special-test.cc b/src/fstext/push-special-test.cc index 0106492e887..557b43d3062 100644 --- a/src/fstext/push-special-test.cc +++ b/src/fstext/push-special-test.cc @@ -37,14 +37,10 @@ static void TestPushSpecial() { VectorFst *fst = RandFst(); { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } - + VectorFst fst_copy(*fst); float delta = kDelta; @@ -59,11 +55,7 @@ static void TestPushSpecial() { { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst_copy, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(fst_copy, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } KALDI_LOG << "Min value is " << min.Value() << ", max value is " << max.Value(); @@ -71,7 +63,7 @@ static void TestPushSpecial() { // below, should be <= delta but different pieces of code compute this in this // part vs. push-special, so the roundoff may be different. KALDI_ASSERT(std::abs(min.Value() - max.Value()) <= 1.2 * delta); - + KALDI_ASSERT(RandEquivalent(*fst, fst_copy, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); delete fst; diff --git a/src/fstext/remove-eps-local-test.cc b/src/fstext/remove-eps-local-test.cc index 2c6c6f8d97f..af8b890cca8 100644 --- a/src/fstext/remove-eps-local-test.cc +++ b/src/fstext/remove-eps-local-test.cc @@ -82,11 +82,7 @@ template static void TestRemoveEpsLocal() { std::cout <<" printing after trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -99,11 +95,7 @@ template static void TestRemoveEpsLocal() { { std::cout << "copy1 = \n"; -#if OPENFST_VER >= 10400 FstPrinter fstprinter(fst_copy1, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(fst_copy1, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -126,7 +118,7 @@ static void TestRemoveEpsLocalSpecial() { typedef LogArc::StateId StateId; typedef LogArc Arc; VectorFst *logfst = RandFst(); - + { // Make the FST stochastic. 
for (StateId s = 0; s < logfst->NumStates(); s++) { Weight w = logfst->Final(s); @@ -148,11 +140,7 @@ static void TestRemoveEpsLocalSpecial() { #endif { std::cout << "logfst = \n"; -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*logfst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*logfst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -167,11 +155,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst2 = \n"; -#if OPENFST_VER >= 10400 FstPrinter fstprinter(logfst2, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(logfst2, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } if (ApproxEqual(ShortestDistance(*logfst), ShortestDistance(logfst2))) { @@ -192,4 +176,3 @@ int main() { TestRemoveEpsLocalSpecial(); } } - diff --git a/src/fstext/table-matcher-test.cc b/src/fstext/table-matcher-test.cc index 0124fff4147..2d39fe957dd 100644 --- a/src/fstext/table-matcher-test.cc +++ b/src/fstext/table-matcher-test.cc @@ -63,21 +63,13 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<"Table-Composed FST\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(composed, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(composed, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } std::cout <<" Baseline-Composed FST\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(composed_baseline, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(composed_baseline, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -86,11 +78,7 @@ template void TestTableMatcher(bool connect, bool left) { Difference(composed, composed_baseline, &diff1); std::cout <<" Diff1 (composed - baseline) \n"; { -#if OPENFST_VER >= 10400 - FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true); -#endif + FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); fstprinter.Print(&std::cout, "standard output"); } @@ -99,11 +87,7 @@ template void TestTableMatcher(bool connect, bool left) { Difference(composed_baseline, composed, &diff2); std::cout <<" Diff2 (baseline - composed) \n"; { -#if OPENFST_VER >= 10400 - FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true); -#endif + FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); fstprinter.Print(&std::cout, "standard output"); } @@ -164,11 +148,7 @@ template void TestTableMatcherCacheLeft(bool connect) { Difference(composed, composed_baseline, &diff1); std::cout <<" Diff1 (composed - baseline) \n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -177,11 +157,7 @@ template void TestTableMatcherCacheLeft(bool connect) { Difference(composed_baseline, composed, &diff2); std::cout <<" Diff2 (baseline - composed) \n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -242,11 +218,7 @@ template void TestTableMatcherCacheRight(bool connect) { Difference(composed, composed_baseline, &diff1); 
std::cout <<" Diff1 (composed - baseline) \n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -255,11 +227,7 @@ template void TestTableMatcherCacheRight(bool connect) { Difference(composed_baseline, composed, &diff2); std::cout <<" Diff2 (baseline - composed) \n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } diff --git a/src/fstext/table-matcher.h b/src/fstext/table-matcher.h index 1a1b35d8c68..792fe98fe83 100644 --- a/src/fstext/table-matcher.h +++ b/src/fstext/table-matcher.h @@ -86,10 +86,6 @@ class TableMatcherImpl : public MatcherBase { virtual const FST &GetFst() const { return *fst_; } virtual ~TableMatcherImpl() { -#if OPENFST_VER >= 10500 -#else - assert(RefCount() == 0); -#endif vector *const empty = ((vector*)(NULL)) + 1; // special marker. for (size_t i = 0; i < tables_.size(); i++) { if (tables_[i] != NULL && tables_[i] != empty) @@ -222,26 +218,7 @@ class TableMatcherImpl : public MatcherBase { virtual uint64 Properties(uint64 props) const { return props; } // simple matcher that does // not change its FST, so properties are properties of FST it is applied to -#if OPENFST_VER >= 10500 -#else - int RefCount() const { - return ref_count_.count(); - } - - int IncrRefCount() { - return ref_count_.Incr(); - } - - int DecrRefCount() { - return ref_count_.Decr(); - } -#endif private: -#if OPENFST_VER >= 10500 -#else - RefCounter ref_count_; // Reference count -#endif - virtual void SetState_(StateId s) { SetState(s); } virtual bool Find_(Label label) { return Find(label); } virtual bool Done_() const { return Done(); } @@ -272,29 +249,22 @@ class TableMatcher : public MatcherBase { typedef StateId ArcId; // Use this type to store arc offsets [it's actually size_t // in the Seek function of ArcIterator, but StateId should be big enough]. 
typedef typename Arc::Weight Weight; - typedef TableMatcherImpl I; + typedef TableMatcherImpl Impl; TableMatcher(const FST &fst, MatchType match_type, - const TableMatcherOptions &opts = TableMatcherOptions()): - impl_(new I(fst, match_type, opts)) { } - - TableMatcher(const TableMatcher &matcher, bool safe): - impl_(matcher.impl_) { -#if OPENFST_VER >= 10500 -#else - impl_->IncrRefCount(); -#endif + const TableMatcherOptions &opts = TableMatcherOptions()) + : impl_(std::make_shared(fst, match_type, opts)) { } + + TableMatcher(const TableMatcher &matcher, + bool safe = false) + : impl_(matcher.impl_) { + if (safe == true) { + KALDI_ERR << "TableMatcher: Safe copy not supported"; + } } virtual const FST &GetFst() const { return impl_->GetFst(); } -#if OPENFST_VER >= 10500 -#else - virtual ~TableMatcher() { - if (!impl_->DecrRefCount()) delete impl_; - } -#endif - virtual MatchType Type(bool test) const { return impl_->Type(test); } void SetState(StateId s) { return impl_->SetState(s); } @@ -316,18 +286,15 @@ class TableMatcher : public MatcherBase { virtual uint64 Properties(uint64 props) const { return impl_->Properties(props); } // simple matcher that does // not change its FST, so properties are properties of FST it is applied to private: -#if OPENFST_VER >= 10500 - std::shared_ptr impl_; -#else - I *impl_; -#endif + std::shared_ptr impl_; virtual void SetState_(StateId s) { impl_->SetState(s); } virtual bool Find_(Label label) { return impl_->Find(label); } virtual bool Done_() const { return impl_->Done(); } virtual const Arc& Value_() const { return impl_->Value(); } virtual void Next_() { impl_->Next(); } - DISALLOW_COPY_AND_ASSIGN(TableMatcher); + + TableMatcher &operator=(const TableMatcher &) = delete; }; struct TableComposeOptions: public TableMatcherOptions { diff --git a/src/fstext/trivial-factor-weight-test.cc b/src/fstext/trivial-factor-weight-test.cc index fcf34b6834e..46b6aaf46fb 100644 --- a/src/fstext/trivial-factor-weight-test.cc +++ b/src/fstext/trivial-factor-weight-test.cc @@ -70,11 +70,7 @@ template void TestFactor() { std::cout <<" printing before trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } // Trim resulting FST. 
@@ -82,11 +78,7 @@ template void TestFactor() { std::cout <<" printing after trimming\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -97,11 +89,7 @@ template void TestFactor() { std::cout <<" printing after predeterminization\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -117,15 +105,11 @@ template void TestFactor() { std::cout <<" printing after double-epsilon removal\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst ofst_star; - + { printf("Converting to Gallic semiring"); VectorFst > gallic_fst; @@ -140,58 +124,33 @@ template void TestFactor() { { std::cout <<" printing gallic FST\n"; -#if OPENFST_VER >= 10400 FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } // Map(ofst_star, &gallic_fst, ToGallicMapper()); - + printf("Converting gallic back to regular\n"); -#if OPENFST_VER >= 10400 TrivialFactorWeightFst< GallicArc, GallicFactor > fwfst(gallic_fst); -#else - TrivialFactorWeightFst< GallicArc, GallicFactor > fwfst(gallic_fst); -#endif { std::cout <<" printing factor-weight FST\n"; -#if OPENFST_VER >= 10400 FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } -#if OPENFST_VER >= 10400 Map(fwfst, &ofst_star, FromGallicMapper()); -#else - Map(fwfst, &ofst_star, FromGallicMapper()); -#endif { std::cout <<" printing after converting back to regular FST\n"; -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst > new_gallic_fst; -#if OPENFST_VER >= 10400 Map(ofst_star, &new_gallic_fst, ToGallicMapper()); -#else - Map(ofst_star, &new_gallic_fst, ToGallicMapper()); -#endif assert(RandEquivalent(gallic_fst, new_gallic_fst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -251,5 +210,3 @@ int main() { fst::TestFactor(); } } - - diff --git a/src/fstext/trivial-factor-weight.h b/src/fstext/trivial-factor-weight.h index 3e42dd287db..f17ba4e2187 100644 --- a/src/fstext/trivial-factor-weight.h +++ b/src/fstext/trivial-factor-weight.h @@ -52,17 +52,8 @@ // This has the advantage that it always works, for any input (also I just // prefer this approach). 
-#ifdef _MSC_VER #include using std::unordered_map; -#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) -#include -using std::unordered_map; -#else -#include -using std::tr1::unordered_map; -#endif - #include #include @@ -117,10 +108,8 @@ class TrivialFactorWeightFstImpl typedef typename A::StateId StateId; typedef F FactorIterator; -#if OPENFST_VER >= 10400 typedef DefaultCacheStore Store; typedef typename Store::State State; -#endif struct Element { Element() {} @@ -157,10 +146,6 @@ class TrivialFactorWeightFstImpl SetOutputSymbols(impl.OutputSymbols()); } - ~TrivialFactorWeightFstImpl() { - delete fst_; - } - StateId Start() { if (!HasStart()) { StateId s = fst_->Start(); @@ -307,7 +292,7 @@ class TrivialFactorWeightFstImpl typedef unordered_map ElementMap; - const Fst *fst_; + std::unique_ptr> fst_; float delta_; uint32 mode_; // factoring arc and/or final weights Label extra_ilabel_; // ilabel of arc created when factoring final w's @@ -315,11 +300,10 @@ class TrivialFactorWeightFstImpl vector elements_; // mapping Fst state to Elements ElementMap element_map_; // mapping Elements to Fst state - void operator = (const TrivialFactorWeightFstImpl &); // disallow }; -/// FactorWeightFst takes as template parameter a FactorIterator as +/// TrivialFactorWeightFst takes as template parameter a FactorIterator as /// defined above. The result of weight factoring is a transducer /// equivalent to the input whose path weights have been factored /// according to the FactorIterator. States and transitions will be @@ -344,54 +328,36 @@ class TrivialFactorWeightFst : public ImplToFst< TrivialFactorWeightFstImpl= 10400 typedef DefaultCacheStore Store; typedef typename Store::State State; -#else - typedef CacheState State; -#endif typedef TrivialFactorWeightFstImpl Impl; - TrivialFactorWeightFst(const Fst &fst) -#if OPENFST_VER >= 10500 + explicit TrivialFactorWeightFst(const Fst &fst) : ImplToFst(std::make_shared(fst, TrivialFactorWeightOptions())) {} -#else - : ImplToFst(new Impl(fst, TrivialFactorWeightOptions())) {} -#endif TrivialFactorWeightFst(const Fst &fst, const TrivialFactorWeightOptions &opts) -#if OPENFST_VER >= 10500 : ImplToFst(std::make_shared(fst, opts)) {} -#else - : ImplToFst(new Impl(fst, opts)) {} -#endif // See Fst<>::Copy() for doc. TrivialFactorWeightFst(const TrivialFactorWeightFst &fst, bool copy) : ImplToFst(fst, copy) {} // Get a copy of this TrivialFactorWeightFst. See Fst<>::Copy() for further doc. - virtual TrivialFactorWeightFst *Copy(bool copy = false) const { + TrivialFactorWeightFst *Copy(bool copy = false) const override { return new TrivialFactorWeightFst(*this, copy); } - virtual inline void InitStateIterator(StateIteratorData *data) const; + inline void InitStateIterator(StateIteratorData *data) const override; - virtual void InitArcIterator(StateId s, ArcIteratorData *data) const { + void InitArcIterator(StateId s, ArcIteratorData *data) const override { GetMutableImpl()->InitArcIterator(s, data); } private: - // Makes visible to friends. 
-#if OPENFST_VER >= 10500 using ImplToFst::GetImpl; using ImplToFst::GetMutableImpl; -#else - const Impl *GetImpl() const { return ImplToFst::GetImpl(); } - Impl *GetMutableImpl() const { return ImplToFst::GetImpl(); } -#endif - void operator=(const TrivialFactorWeightFst &fst); // Disallow + TrivialFactorWeightFst &operator=(const TrivialFactorWeightFst &fst) = delete; }; @@ -413,18 +379,14 @@ class ArcIterator< TrivialFactorWeightFst > typedef typename A::StateId StateId; ArcIterator(const TrivialFactorWeightFst &fst, StateId s) - : CacheArcIterator< TrivialFactorWeightFst >(fst.GetMutableImpl(), s) { - if (!fst.GetImpl()->HasArcs(s)) - fst.GetMutableImpl()->Expand(s); + : CacheArcIterator< TrivialFactorWeightFst>(fst.GetMutableImpl(), s) { + if (!fst.GetImpl()->HasArcs(s)) fst.GetMutableImpl()->Expand(s); } - - private: - DISALLOW_COPY_AND_ASSIGN(ArcIterator); }; -template inline -void TrivialFactorWeightFst::InitStateIterator(StateIteratorData *data) const -{ +template +inline void TrivialFactorWeightFst::InitStateIterator( + StateIteratorData *data) const { data->base = new StateIterator< TrivialFactorWeightFst >(*this); } diff --git a/src/kwsbin/kws-search.cc b/src/kwsbin/kws-search.cc index 6f1a5d763d4..1ef2655c656 100644 --- a/src/kwsbin/kws-search.cc +++ b/src/kwsbin/kws-search.cc @@ -316,11 +316,7 @@ int main(int argc, char *argv[]) { } Project(&result_fst, PROJECT_OUTPUT); -#if OPENFST_VER >= 10500 Minimize(&result_fst, (KwsLexicographicFst *) nullptr, kDelta, true); -#else - Minimize(&result_fst); -#endif ShortestPath(result_fst, &result_fst, n_best); RmEpsilon(&result_fst); diff --git a/src/lat/determinize-lattice-pruned-test.cc b/src/lat/determinize-lattice-pruned-test.cc index c932e3c95de..f6684f0b5b5 100644 --- a/src/lat/determinize-lattice-pruned-test.cc +++ b/src/lat/determinize-lattice-pruned-test.cc @@ -37,7 +37,7 @@ template void TestDeterminizeLatticePruned() { typedef kaldi::int32 Int; typedef typename Arc::Weight Weight; typedef ArcTpl > CompactArc; - + for(int i = 0; i < 100; i++) { RandFstOptions opts; opts.n_states = 4; @@ -47,10 +47,10 @@ template void TestDeterminizeLatticePruned() { opts.weight_multiplier = 0.5; // impt for the randomly generated weights opts.acyclic = true; // to be exactly representable in float, - // or this test fails because numerical differences can cause symmetry in + // or this test fails because numerical differences can cause symmetry in // weights to be broken, which causes the wrong path to be chosen as far // as the string part is concerned. 
- + VectorFst *fst = RandPairFst(opts); bool sorted = TopSort(fst); @@ -59,14 +59,10 @@ template void TestDeterminizeLatticePruned() { ILabelCompare ilabel_comp; if (kaldi::Rand() % 2 == 0) ArcSort(fst, ilabel_comp); - + std::cout << "FST before lattice-determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst det_fst; @@ -79,11 +75,7 @@ template void TestDeterminizeLatticePruned() { std::cout << "FST after lattice-determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } KALDI_ASSERT(det_fst.Properties(kIDeterministic, true) & kIDeterministic); @@ -95,27 +87,19 @@ template void TestDeterminizeLatticePruned() { VectorFst pruned_fst(*fst); if (pruned_fst.NumStates() != 0) kaldi::PruneLattice(10.0, &pruned_fst); - + VectorFst compact_pruned_fst, compact_pruned_det_fst; ConvertLattice(pruned_fst, &compact_pruned_fst, false); std::cout << "Compact pruned FST is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(compact_pruned_fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(compact_pruned_fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } ConvertLattice(det_fst, &compact_pruned_det_fst, false); - + std::cout << "Compact version of determinized FST is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(compact_pruned_det_fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(compact_pruned_det_fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } @@ -138,22 +122,14 @@ template void TestDeterminizeLatticePruned2() { VectorFst *fst = RandPairFst(opts); std::cout << "FST before lattice-determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } VectorFst ofst; DeterminizeLatticePruned(*fst, 10.0, &ofst); std::cout << "FST after lattice-determinizing is:\n"; { -#if OPENFST_VER >= 10400 FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); -#else - FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); -#endif fstprinter.Print(&std::cout, "standard output"); } delete fst; diff --git a/src/lat/kaldi-lattice.cc b/src/lat/kaldi-lattice.cc index b44b12a5a23..744cc538462 100644 --- a/src/lat/kaldi-lattice.cc +++ b/src/lat/kaldi-lattice.cc @@ -75,15 +75,9 @@ bool WriteCompactLattice(std::ostream &os, bool binary, // on its own line. 
os << '\n'; bool acceptor = true, write_one = false; -#if OPENFST_VER >= 10400 fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); -#else - fst::FstPrinter printer(t, t.InputSymbols(), - t.OutputSymbols(), - NULL, acceptor, write_one); -#endif printer.Print(&os, ""); if (os.fail()) KALDI_WARN << "Stream failure detected."; @@ -131,7 +125,7 @@ class LatticeReader { if (col.size() > 5) { KALDI_WARN << "Reading lattice: bad line in FST: " << line; delete fst; - delete cfst; + delete cfst; return PairT(static_cast(NULL), static_cast(NULL)); } @@ -168,7 +162,7 @@ class LatticeReader { else fst->SetFinal(s, w); break; case 3: // 3 columns not ok for Lattice format; it's not an acceptor. - ok = false; + ok = false; break; case 4: ok = ConvertStringToInteger(col[1], &arc.nextstate) && @@ -253,7 +247,7 @@ class LatticeReader { SplitStringToVector(line, separator.c_str(), true, &col); if (col.empty()) break; } - return PairT(static_cast(NULL), + return PairT(static_cast(NULL), static_cast(NULL)); } } @@ -406,15 +400,9 @@ bool WriteLattice(std::ostream &os, bool binary, const Lattice &t) { // on its own line. os << '\n'; bool acceptor = false, write_one = false; -#if OPENFST_VER >= 10400 fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); -#else - fst::FstPrinter printer(t, t.InputSymbols(), - t.OutputSymbols(), - NULL, acceptor, write_one); -#endif printer.Print(&os, ""); if (os.fail()) KALDI_WARN << "Stream failure detected."; @@ -511,7 +499,7 @@ bool LatticeHolder::Read(std::istream &is) { } else { return ReadLattice(is, true, &t_); } -} +} diff --git a/src/lat/push-lattice-test.cc b/src/lat/push-lattice-test.cc index ecd60501888..cc9ae827a86 100644 --- a/src/lat/push-lattice-test.cc +++ b/src/lat/push-lattice-test.cc @@ -90,23 +90,13 @@ void TestPushCompactLatticeWeights() { } if (!ApproxEqual(sum, LatticeWeight::One())) { { -#if OPENFST_VER >= 10400 fst::FstPrinter printer(clat2, NULL, NULL, NULL, true, true, "\t"); -#else - fst::FstPrinter printer(clat2, NULL, NULL, - NULL, true, true); -#endif printer.Print(&std::cerr, ""); } { -#if OPENFST_VER >= 10400 fst::FstPrinter printer(*clat, NULL, NULL, NULL, true, true, "\t"); -#else - fst::FstPrinter printer(*clat, NULL, NULL, - NULL, true, true); -#endif printer.Print(&std::cerr, ""); } KALDI_ERR << "Bad lattice being pushed."; diff --git a/src/latbin/lattice-compose.cc b/src/latbin/lattice-compose.cc index 365be941a85..b9b261f7d36 100644 --- a/src/latbin/lattice-compose.cc +++ b/src/latbin/lattice-compose.cc @@ -85,12 +85,8 @@ int main(int argc, char *argv[]) { if (phi_label > 0) PropagateFinal(phi_label, fst2); -#if OPENFST_VER >= 10500 fst::CacheOptions cache_opts(true, num_states_cache); fst::MapFstOptions mapfst_opts(cache_opts); -#else - fst::CacheOptions mapfst_opts(true, num_states_cache); -#endif fst::StdToLatticeMapper mapper; fst::MapFst > mapped_fst2(*fst2, mapper, mapfst_opts); diff --git a/src/latbin/lattice-lmrescore.cc b/src/latbin/lattice-lmrescore.cc index d60d5fe93e5..2e5406f75de 100644 --- a/src/latbin/lattice-lmrescore.cc +++ b/src/latbin/lattice-lmrescore.cc @@ -74,12 +74,8 @@ int main(int argc, char *argv[]) { // mapped_fst is the LM fst interpreted using the LatticeWeight semiring, // with all the cost on the first member of the pair (since it's a graph // weight). 
-#if OPENFST_VER >= 10500 fst::CacheOptions cache_opts(true, num_states_cache); fst::MapFstOptions mapfst_opts(cache_opts); -#else - fst::CacheOptions mapfst_opts(true, num_states_cache); -#endif fst::StdToLatticeMapper mapper; fst::MapFst > lm_fst(*std_lm_fst, mapper, mapfst_opts); From fb5b512d6997241359dc20edf5ecf0f3d073f35c Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 6 Dec 2016 00:16:07 -0800 Subject: [PATCH 074/213] Remove obsolete OpenFst version checks in Darwin makefiles. --- src/makefiles/darwin_10_10.mk | 12 +++--------- src/makefiles/darwin_10_11.mk | 12 +++--------- src/makefiles/darwin_10_12.mk | 12 +++--------- src/makefiles/darwin_10_9.mk | 12 +++--------- 4 files changed, 12 insertions(+), 36 deletions(-) diff --git a/src/makefiles/darwin_10_10.mk b/src/makefiles/darwin_10_10.mk index 77d82708b1e..aeff69d4953 100644 --- a/src/makefiles/darwin_10_10.mk +++ b/src/makefiles/darwin_10_10.mk @@ -1,7 +1,7 @@ # makefiles/darwin_10_10.mk contains Darwin-specific rules for OS X 10.10.* ifndef FSTROOT -$(error FSTROOT not defined.) + $(error FSTROOT not defined.) endif DOUBLE_PRECISION = 0 @@ -17,7 +17,7 @@ CXXFLAGS += -msse -msse2 -Wall -I.. \ ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC + CXXFLAGS += -fPIC endif LDFLAGS = -g @@ -32,16 +32,10 @@ AR = ar COMPILER = $(shell $(CXX) -v 2>&1 ) ifeq ($(findstring clang,$(COMPILER)),clang) CXXFLAGS += -Wno-mismatched-tags - # Link with libstdc++ if we are building against OpenFst < 1.4 - OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") - ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10400)","1") - CXXFLAGS += -stdlib=libstdc++ - LDFLAGS += -stdlib=libstdc++ - endif endif # We need to tell recent versions of g++ to allow vector conversions without # an explicit cast provided the vectors are of the same size. ifeq ($(findstring GCC,$(COMPILER)),GCC) - CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs + CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs endif diff --git a/src/makefiles/darwin_10_11.mk b/src/makefiles/darwin_10_11.mk index c3b11a49cfc..40ee3adf6d0 100644 --- a/src/makefiles/darwin_10_11.mk +++ b/src/makefiles/darwin_10_11.mk @@ -1,7 +1,7 @@ # makefiles/darwin_10_11.mk contains Darwin-specific rules for OS X 10.11.* ifndef FSTROOT -$(error FSTROOT not defined.) + $(error FSTROOT not defined.) endif DOUBLE_PRECISION = 0 @@ -17,7 +17,7 @@ CXXFLAGS += -msse -msse2 -Wall -I.. \ ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC + CXXFLAGS += -fPIC endif LDFLAGS = -g @@ -32,16 +32,10 @@ AR = ar COMPILER = $(shell $(CXX) -v 2>&1 ) ifeq ($(findstring clang,$(COMPILER)),clang) CXXFLAGS += -Wno-mismatched-tags - # Link with libstdc++ if we are building against OpenFst < 1.4 - OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") - ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10400)","1") - CXXFLAGS += -stdlib=libstdc++ - LDFLAGS += -stdlib=libstdc++ - endif endif # We need to tell recent versions of g++ to allow vector conversions without # an explicit cast provided the vectors are of the same size. 
ifeq ($(findstring GCC,$(COMPILER)),GCC) - CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs + CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs endif diff --git a/src/makefiles/darwin_10_12.mk b/src/makefiles/darwin_10_12.mk index 46e05cc3427..10acd2d8577 100644 --- a/src/makefiles/darwin_10_12.mk +++ b/src/makefiles/darwin_10_12.mk @@ -1,7 +1,7 @@ # makefiles/darwin_10_12.mk contains Darwin-specific rules for OS X 10.12.* ifndef FSTROOT -$(error FSTROOT not defined.) + $(error FSTROOT not defined.) endif DOUBLE_PRECISION = 0 @@ -17,7 +17,7 @@ CXXFLAGS += -msse -msse2 -Wall -I.. \ ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC + CXXFLAGS += -fPIC endif LDFLAGS = -g @@ -32,16 +32,10 @@ AR = ar COMPILER = $(shell $(CXX) -v 2>&1 ) ifeq ($(findstring clang,$(COMPILER)),clang) CXXFLAGS += -Wno-mismatched-tags - # Link with libstdc++ if we are building against OpenFst < 1.4 - OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") - ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10400)","1") - CXXFLAGS += -stdlib=libstdc++ - LDFLAGS += -stdlib=libstdc++ - endif endif # We need to tell recent versions of g++ to allow vector conversions without # an explicit cast provided the vectors are of the same size. ifeq ($(findstring GCC,$(COMPILER)),GCC) - CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs + CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs endif diff --git a/src/makefiles/darwin_10_9.mk b/src/makefiles/darwin_10_9.mk index f3e8817503e..ede1712e155 100644 --- a/src/makefiles/darwin_10_9.mk +++ b/src/makefiles/darwin_10_9.mk @@ -1,7 +1,7 @@ # makefiles/darwin_10_9.mk contains Darwin-specific rules for OS X 10.9.* ifndef FSTROOT -$(error FSTROOT not defined.) + $(error FSTROOT not defined.) endif DOUBLE_PRECISION = 0 @@ -17,7 +17,7 @@ CXXFLAGS += -msse -msse2 -Wall -I.. \ ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC + CXXFLAGS += -fPIC endif LDFLAGS = -g @@ -32,16 +32,10 @@ AR = ar COMPILER = $(shell $(CXX) -v 2>&1 ) ifeq ($(findstring clang,$(COMPILER)),clang) CXXFLAGS += -Wno-mismatched-tags - # Link with libstdc++ if we are building against OpenFst < 1.4 - OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") - ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10400)","1") - CXXFLAGS += -stdlib=libstdc++ - LDFLAGS += -stdlib=libstdc++ - endif endif # We need to tell recent versions of g++ to allow vector conversions without # an explicit cast provided the vectors are of the same size. 
ifeq ($(findstring GCC,$(COMPILER)),GCC) - CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs + CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs endif From a41c27b41267525c6163884ea96947168d4100ea Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 6 Dec 2016 00:37:15 -0800 Subject: [PATCH 075/213] Remove support for OpenFst < 1.5.3 from tools/Makefile --- tools/INSTALL | 6 +- tools/Makefile | 40 +-- tools/extras/openfst-1.3.4.patch | 395 -------------------------- tools/extras/openfst-1.4.1.patch | 153 ---------- tools/extras/openfst_gcc41up.patch | 28 -- tools/extras/openfstwin-1.3.4.patch | 421 ---------------------------- 6 files changed, 10 insertions(+), 1033 deletions(-) delete mode 100644 tools/extras/openfst-1.3.4.patch delete mode 100644 tools/extras/openfst-1.4.1.patch delete mode 100644 tools/extras/openfst_gcc41up.patch delete mode 100644 tools/extras/openfstwin-1.3.4.patch diff --git a/tools/INSTALL b/tools/INSTALL index b13d45826bd..0678e2c8815 100644 --- a/tools/INSTALL +++ b/tools/INSTALL @@ -18,11 +18,9 @@ build by supplying the "-j" option to make, e.g. to use 4 CPUs: make -j 4 -By default, Kaldi builds against OpenFst-1.3.4. If you want to build against -OpenFst-1.4, edit the Makefile in this folder. Note that this change requires -a relatively new compiler with C++11 support, e.g. gcc >= 4.6, clang >= 3.0. +Kaldi builds against OpenFst >= 1.5.3 which requires a relatively new compiler +with C++11 support, e.g. gcc >= 4.6, clang >= 3.0. In extras/, there are also various scripts to install extra bits and pieces that are used by individual example scripts. If an example script needs you to run one of those scripts, it will tell you what to do. - diff --git a/tools/Makefile b/tools/Makefile index 0f5af6c7452..b6687ad1540 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -4,28 +4,14 @@ CXX = g++ # CXX = clang++ # Uncomment this line to build with Clang. CC = gcc # used for sph2pipe -OPENFST_VERSION = 1.3.4 -# Uncomment the next line to build with OpenFst-1.4.1. -# OPENFST_VERSION = 1.4.1 -# Uncomment the next line to build with OpenFst-1.5.4. -# OPENFST_VERSION = 1.5.4 -# Note: OpenFst >= 1.4 requires C++11 support, hence you will need to use a +# Note: OpenFst >= 1.5.3 requires C++11 support, hence you will need to use a # relatively recent C++ compiler, e.g. gcc >= 4.6, clang >= 3.0. +OPENFST_VERSION = 1.5.4 -# On Mac OS 10.9+, clang defaults to the new c++ standard library libc++. -# Since OpenFst-1.3 uses stuff from the tr1 namespace, we need to tell clang -# to use libstdc++ instead. -ifeq ($(OPENFST_VERSION), 1.3.4) - COMPILER = $(shell $(CXX) -v 2>&1 ) - ifeq ($(findstring clang,$(COMPILER)),clang) - CXXFLAGS += -stdlib=libstdc++ - LDFLAGS += -stdlib=libstdc++ - endif -else ifeq ($(OPENFST_VERSION), 1.4.1) -else ifeq ($(OPENFST_VERSION), 1.5.4) -else - $(error OpenFst version $(OPENFST_VERSION) is not supported. \ - Supported versions: 1.3.4, 1.4.1, 1.5.4) +OPENFST_VER_NUM := $(shell echo $(OPENFST_VERSION) | sed 's/\./ /g' | xargs printf "%d%02d%02d") +ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10503)","1") + $(error OpenFst-$(OPENFST_VERSION) is not supported. \ + Supported versions: >= 1.5.3) endif all: check_required_programs sph2pipe atlas sclite openfst @@ -78,7 +64,7 @@ openfst-$(OPENFST_VERSION)/lib: | openfst-$(OPENFST_VERSION)/Makefile # Add the -O flag to CXXFLAGS on cygwin as it can fix the compilation error # "file too big". 
-openfst-$(OPENFST_VERSION)/Makefile: openfst-$(OPENFST_VERSION)/.patched | check_required_programs +openfst-$(OPENFST_VERSION)/Makefile: openfst-$(OPENFST_VERSION) | check_required_programs # Note: OSTYPE path is probably dead for latest cygwin64 (installed on 2016/11/11). ifeq ($(OSTYPE),cygwin) cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS) -O -Wa,-mbig-obj" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" @@ -93,16 +79,6 @@ else cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" endif -# patches for openfst. openfst_gcc41up.patch is a patch for openfst to -# support multi-threading when compiling with gcc >= 4.1. -openfst-$(OPENFST_VERSION)/.patched: | openfst-$(OPENFST_VERSION) -ifneq ($(OPENFST_VERSION), 1.5.4) - cd openfst-$(OPENFST_VERSION)/; \ - patch -p1 -N < ../extras/openfst-$(OPENFST_VERSION).patch; - $(CXX) -dumpversion | awk '{if(NR==1 && $$1>"4.1") print "cd openfst-$(OPENFST_VERSION)/src/include/fst; patch -c -p0 -N < ../../../../extras/openfst_gcc41up.patch"}' | sh - -endif - touch $@ - openfst-$(OPENFST_VERSION): openfst-$(OPENFST_VERSION).tar.gz tar xozf openfst-$(OPENFST_VERSION).tar.gz @@ -167,7 +143,7 @@ fortran_opt = $(shell gcc -v 2>&1 | perl -e '$$x = join(" ", ); if($$x =~ # note: you can uncomment the line that has USE_THREAD=1 and comment the line -# that has USE_THREADE=0 if you want Open Blas to use multiple threads. then +# that has USE_THREAD=0 if you want Open Blas to use multiple threads. then # you could set, for example, OPENBLAS_NUM_THREADS=2 in your path.sh so that the # runtime knows how many threads to use. Note: if you ever get the error # "Program is Terminated. Because you tried to allocate too many memory diff --git a/tools/extras/openfst-1.3.4.patch b/tools/extras/openfst-1.3.4.patch deleted file mode 100644 index 41ce6d59221..00000000000 --- a/tools/extras/openfst-1.3.4.patch +++ /dev/null @@ -1,395 +0,0 @@ ---- a/src/include/fst/interval-set.h -+++ b/src/include/fst/interval-set.h -@@ -37,38 +37,38 @@ template - class IntervalSet { - public: - struct Interval { -- T begin; -- T end; -+ T begin_; -+ T end_; - -- Interval() : begin(-1), end(-1) {} -+ Interval() : begin_(-1), end_(-1) {} - -- Interval(T b, T e) : begin(b), end(e) {} -+ Interval(T b, T e) : begin_(b), end_(e) {} - - bool operator<(const Interval &i) const { -- return begin < i.begin || (begin == i.begin && end > i.end); -+ return begin_ < i.begin_ || (begin_ == i.begin_ && end_ > i.end_); - } - - bool operator==(const Interval &i) const { -- return begin == i.begin && end == i.end; -+ return begin_ == i.begin_ && end_ == i.end_; - } - - bool operator!=(const Interval &i) const { -- return begin != i.begin || end != i.end; -+ return begin_ != i.begin_ || end_ != i.end_; - } - - istream &Read(istream &strm) { - T n; - ReadType(strm, &n); -- begin = n; -+ begin_ = n; - ReadType(strm, &n); -- end = n; -+ end_ = n; - return strm; - } - - ostream &Write(ostream &strm) const { -- T n = begin; -+ T n = begin_; - WriteType(strm, n); -- n = end; -+ n = end_; - WriteType(strm, n); - return strm; - } -@@ -108,7 +108,7 @@ class IntervalSet { - lower_bound(intervals_.begin(), intervals_.end(), interval); - if (lb == intervals_.begin()) - return false; -- return (--lb)->end > value; -+ return (--lb)->end_ > value; - } - - // Requires intervals be normalized. 
-@@ -123,7 +123,7 @@ class IntervalSet { - - bool Singleton() const { - return intervals_.size() == 1 && -- intervals_[0].begin + 1 == intervals_[0].end; -+ intervals_[0].begin_ + 1 == intervals_[0].end_; - } - - -@@ -178,17 +178,17 @@ void IntervalSet::Normalize() { - T size = 0; - for (T i = 0; i < intervals_.size(); ++i) { - Interval &inti = intervals_[i]; -- if (inti.begin == inti.end) -+ if (inti.begin_ == inti.end_) - continue; - for (T j = i + 1; j < intervals_.size(); ++j) { - Interval &intj = intervals_[j]; -- if (intj.begin > inti.end) -+ if (intj.begin_ > inti.end_) - break; -- if (intj.end > inti.end) -- inti.end = intj.end; -+ if (intj.end_ > inti.end_) -+ inti.end_ = intj.end_; - ++i; - } -- count_ += inti.end - inti.begin; -+ count_ += inti.end_ - inti.begin_; - intervals_[size++] = inti; - } - intervals_.resize(size); -@@ -208,17 +208,17 @@ void IntervalSet::Intersect(const IntervalSet &iset, - oset->count_ = 0; - - while (it1 != intervals_.end() && it2 != iintervals->end()) { -- if (it1->end <= it2->begin) { -+ if (it1->end_ <= it2->begin_) { - ++it1; -- } else if (it2->end <= it1->begin) { -+ } else if (it2->end_ <= it1->begin_) { - ++it2; - } else { - Interval interval; -- interval.begin = max(it1->begin, it2->begin); -- interval.end = min(it1->end, it2->end); -+ interval.begin_ = max(it1->begin_, it2->begin_); -+ interval.end_ = min(it1->end_, it2->end_); - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -- if (it1->end < it2->end) -+ oset->count_ += interval.end_ - interval.begin_; -+ if (it1->end_ < it2->end_) - ++it1; - else - ++it2; -@@ -235,21 +235,21 @@ void IntervalSet::Complement(T maxval, IntervalSet *oset) const { - oset->count_ = 0; - - Interval interval; -- interval.begin = 0; -+ interval.begin_ = 0; - for (typename vector::const_iterator it = intervals_.begin(); - it != intervals_.end(); - ++it) { -- interval.end = min(it->begin, maxval); -- if (interval.begin < interval.end) { -+ interval.end_ = min(it->begin_, maxval); -+ if (interval.begin_ < interval.end_) { - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -+ oset->count_ += interval.end_ - interval.begin_; - } -- interval.begin = it->end; -+ interval.begin_ = it->end_; - } -- interval.end = maxval; -- if (interval.begin < interval.end) { -+ interval.end_ = maxval; -+ if (interval.begin_ < interval.end_) { - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -+ oset->count_ += interval.end_ - interval.begin_; - } - } - -@@ -263,7 +263,7 @@ void IntervalSet::Difference(const IntervalSet &iset, - oset->count_ = 0; - } else { - IntervalSet cset; -- iset.Complement(intervals_.back().end, &cset); -+ iset.Complement(intervals_.back().end_, &cset); - Intersect(cset, oset); - } - } -@@ -277,9 +277,9 @@ bool IntervalSet::Overlaps(const IntervalSet &iset) const { - typename vector::const_iterator it2 = intervals->begin(); - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { -+ if (it1->end_ <= it2->begin_) { - ++it1; -- } else if (it2->end <= it1->begin) { -+ } else if (it2->end_ <= it1->begin_) { - ++it2; - } else { - return true; -@@ -300,21 +300,21 @@ bool IntervalSet::StrictlyOverlaps(const IntervalSet &iset) const { - bool overlap = false; // point in both intervals_ and intervals - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { // no overlap - it1 first -+ if (it1->end_ <= it2->begin_) { // no overlap - it1 first 
- only1 = true; - ++it1; -- } else if (it2->end <= it1->begin) { // no overlap - it2 first -+ } else if (it2->end_ <= it1->begin_) { // no overlap - it2 first - only2 = true; - ++it2; -- } else if (it2->begin == it1->begin && it2->end == it1->end) { // equals -+ } else if (it2->begin_ == it1->begin_ && it2->end_ == it1->end_) { // equals - overlap = true; - ++it1; - ++it2; -- } else if (it2->begin <= it1->begin && it2->end >= it1->end) { // 1 c 2 -+ } else if (it2->begin_ <= it1->begin_ && it2->end_ >= it1->end_) { // 1 c 2 - only2 = true; - overlap = true; - ++it1; -- } else if (it1->begin <= it2->begin && it1->end >= it2->end) { // 2 c 1 -+ } else if (it1->begin_ <= it2->begin_ && it1->end_ >= it2->end_) { // 2 c 1 - only1 = true; - overlap = true; - ++it2; -@@ -346,11 +346,11 @@ bool IntervalSet::Contains(const IntervalSet &iset) const { - typename vector::const_iterator it2 = intervals->begin(); - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { // no overlap - it1 first -+ if (it1->end_ <= it2->begin_) { // no overlap - it1 first - ++it1; -- } else if (it2->begin < it1->begin || it2->end > it1->end) { // no C -+ } else if (it2->begin_ < it1->begin_ || it2->end_ > it1->end_) { // no C - return false; -- } else if (it2->end == it1->end) { -+ } else if (it2->end_ == it1->end_) { - ++it1; - ++it2; - } else { -@@ -370,7 +370,7 @@ ostream &operator<<(ostream &strm, const IntervalSet &s) { - ++it) { - if (it != intervals->begin()) - strm << ","; -- strm << "[" << it->begin << "," << it->end << ")"; -+ strm << "[" << it->begin_ << "," << it->end_ << ")"; - } - strm << "}"; - return strm; ---- a/src/include/fst/minimize.h -+++ b/src/include/fst/minimize.h -@@ -134,7 +134,14 @@ class CyclicMinimizer { - typedef typename A::Weight Weight; - typedef ReverseArc RevA; - -- CyclicMinimizer(const ExpandedFst& fst) { -+ CyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. -+ P_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // this minimization -+ // algorithm for non-deterministic FSTs can only work with idempotent -+ // semirings. - Initialize(fst); - Compute(fst); - } -@@ -315,7 +322,13 @@ class AcyclicMinimizer { - typedef typename A::StateId ClassId; - typedef typename A::Weight Weight; - -- AcyclicMinimizer(const ExpandedFst& fst) { -+ AcyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. -+ partition_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // minimization for -+ // non-deterministic FSTs can only work with idempotent semirings. 
- Initialize(fst); - Refine(fst); - } -@@ -531,13 +544,7 @@ template - void Minimize(MutableFst* fst, - MutableFst* sfst = 0, - float delta = kDelta) { -- uint64 props = fst->Properties(kAcceptor | kIDeterministic| -- kWeighted | kUnweighted, true); -- if (!(props & kIDeterministic)) { -- FSTERROR() << "FST is not deterministic"; -- fst->SetProperties(kError, kError); -- return; -- } -+ uint64 props = fst->Properties(kAcceptor | kWeighted | kUnweighted, true); - - if (!(props & kAcceptor)) { // weighted transducer - VectorFst< GallicArc > gfst; ---- a/src/include/fst/partition.h -+++ b/src/include/fst/partition.h -@@ -43,8 +43,8 @@ class Partition { - friend class PartitionIterator; - - struct Element { -- Element() : value(0), next(0), prev(0) {} -- Element(T v) : value(v), next(0), prev(0) {} -+ Element() : value(0), next(0), prev(0) {} -+ Element(T v) : value(v), next(0), prev(0) {} - - T value; - Element* next; -@@ -52,9 +52,11 @@ class Partition { - }; - - public: -- Partition() {} -+ Partition(bool allow_repeated_split): -+ allow_repeated_split_(allow_repeated_split) {} - -- Partition(T num_states) { -+ Partition(bool allow_repeated_split, T num_states): -+ allow_repeated_split_(allow_repeated_split) { - Initialize(num_states); - } - -@@ -137,16 +139,16 @@ class Partition { - if (class_size_[class_id] == 1) return; - - // first time class is split -- if (split_size_[class_id] == 0) -+ if (split_size_[class_id] == 0) { - visited_classes_.push_back(class_id); -- -+ class_split_[class_id] = classes_[class_id]; -+ } - // increment size of split (set of element at head of chain) - split_size_[class_id]++; - - // update split point -- if (class_split_[class_id] == 0) -- class_split_[class_id] = classes_[class_id]; -- if (class_split_[class_id] == elements_[element_id]) -+ if (class_split_[class_id] != 0 -+ && class_split_[class_id] == elements_[element_id]) - class_split_[class_id] = elements_[element_id]->next; - - // move to head of chain in same class -@@ -157,24 +159,31 @@ class Partition { - // class indices of the newly created class. Returns the new_class id - // or -1 if no new class was created. - T SplitRefine(T class_id) { -+ -+ Element* split_el = class_split_[class_id]; - // only split if necessary -- if (class_size_[class_id] == split_size_[class_id]) { -- class_split_[class_id] = 0; -+ //if (class_size_[class_id] == split_size_[class_id]) { -+ if(split_el == NULL) { // we split on everything... - split_size_[class_id] = 0; - return -1; - } else { -- - T new_class = AddClass(); -+ -+ if(allow_repeated_split_) { // split_size_ is possibly -+ // inaccurate, so work it out exactly. 
-+ size_t split_count; Element *e; -+ for(split_count=0,e=classes_[class_id]; -+ e != split_el; split_count++, e=e->next); -+ split_size_[class_id] = split_count; -+ } - size_t remainder = class_size_[class_id] - split_size_[class_id]; - if (remainder < split_size_[class_id]) { // add smaller -- Element* split_el = class_split_[class_id]; - classes_[new_class] = split_el; -+ split_el->prev->next = 0; -+ split_el->prev = 0; - class_size_[class_id] = split_size_[class_id]; - class_size_[new_class] = remainder; -- split_el->prev->next = 0; -- split_el->prev = 0; - } else { -- Element* split_el = class_split_[class_id]; - classes_[new_class] = classes_[class_id]; - class_size_[class_id] = remainder; - class_size_[new_class] = split_size_[class_id]; -@@ -245,10 +254,16 @@ class Partition { - vector class_size_; - - // size of split for each class -+ // in the nondeterministic case, split_size_ is actually an upper -+ // bound on the size of split for each class. - vector split_size_; - - // set of visited classes to be used in split refine - vector visited_classes_; -+ -+ // true if input fst was deterministic: we can make -+ // certain assumptions in this case that speed up the algorithm. -+ bool allow_repeated_split_; - }; - - ---- a/src/script/text-io.cc -+++ b/src/script/text-io.cc -@@ -84,7 +84,7 @@ bool WritePotentials(const string& filename, - if (!*strm) - LOG(ERROR) << "WritePotentials: Write failed: " - << (filename.empty() ? "standard output" : filename); -- bool ret = *strm; -+ bool ret = !strm->fail(); - if (strm != &cout) - delete strm; - return ret; - ---- a/src/include/fst/extensions/ngram/ngram-fst.h -+++ b/src/include/fst/extensions/ngram/ngram-fst.h -@@ -130,7 +130,7 @@ - hdr.SetNumStates(num_states_); - WriteHeader(strm, opts, kFileVersion, &hdr); - strm.write(data_, Storage(num_states_, num_futures_, num_final_)); -+ return !strm.fail(); -- return strm; - } - - StateId Start() const { diff --git a/tools/extras/openfst-1.4.1.patch b/tools/extras/openfst-1.4.1.patch deleted file mode 100644 index 5889191d1a0..00000000000 --- a/tools/extras/openfst-1.4.1.patch +++ /dev/null @@ -1,153 +0,0 @@ ---- a/src/include/fst/minimize.h -+++ b/src/include/fst/minimize.h -@@ -134,7 +134,14 @@ class CyclicMinimizer { - typedef typename A::Weight Weight; - typedef ReverseArc RevA; - -- CyclicMinimizer(const ExpandedFst& fst) { -+ CyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. -+ P_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // this minimization -+ // algorithm for non-deterministic FSTs can only work with idempotent -+ // semirings. - Initialize(fst); - Compute(fst); - } -@@ -315,7 +322,13 @@ class AcyclicMinimizer { - typedef typename A::StateId ClassId; - typedef typename A::Weight Weight; - -- AcyclicMinimizer(const ExpandedFst& fst) { -+ AcyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. -+ partition_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // minimization for -+ // non-deterministic FSTs can only work with idempotent semirings. 
- Initialize(fst); - Refine(fst); - } -@@ -531,13 +544,7 @@ template - void Minimize(MutableFst* fst, - MutableFst* sfst = 0, - float delta = kDelta) { -- uint64 props = fst->Properties(kAcceptor | kIDeterministic| -- kWeighted | kUnweighted, true); -- if (!(props & kIDeterministic)) { -- FSTERROR() << "FST is not deterministic"; -- fst->SetProperties(kError, kError); -- return; -- } -+ uint64 props = fst->Properties(kAcceptor | kWeighted | kUnweighted, true); - - if (!(props & kAcceptor)) { // weighted transducer - VectorFst< GallicArc > gfst; ---- a/src/include/fst/partition.h -+++ b/src/include/fst/partition.h -@@ -43,8 +43,8 @@ class Partition { - friend class PartitionIterator; - - struct Element { -- Element() : value(0), next(0), prev(0) {} -- Element(T v) : value(v), next(0), prev(0) {} -+ Element() : value(0), next(0), prev(0) {} -+ Element(T v) : value(v), next(0), prev(0) {} - - T value; - Element* next; -@@ -52,9 +52,11 @@ class Partition { - }; - - public: -- Partition() {} -+ Partition(bool allow_repeated_split): -+ allow_repeated_split_(allow_repeated_split) {} - -- Partition(T num_states) { -+ Partition(bool allow_repeated_split, T num_states): -+ allow_repeated_split_(allow_repeated_split) { - Initialize(num_states); - } - -@@ -137,16 +139,16 @@ class Partition { - if (class_size_[class_id] == 1) return; - - // first time class is split -- if (split_size_[class_id] == 0) -+ if (split_size_[class_id] == 0) { - visited_classes_.push_back(class_id); -- -+ class_split_[class_id] = classes_[class_id]; -+ } - // increment size of split (set of element at head of chain) - split_size_[class_id]++; - - // update split point -- if (class_split_[class_id] == 0) -- class_split_[class_id] = classes_[class_id]; -- if (class_split_[class_id] == elements_[element_id]) -+ if (class_split_[class_id] != 0 -+ && class_split_[class_id] == elements_[element_id]) - class_split_[class_id] = elements_[element_id]->next; - - // move to head of chain in same class -@@ -157,24 +159,31 @@ class Partition { - // class indices of the newly created class. Returns the new_class id - // or -1 if no new class was created. - T SplitRefine(T class_id) { -+ -+ Element* split_el = class_split_[class_id]; - // only split if necessary -- if (class_size_[class_id] == split_size_[class_id]) { -- class_split_[class_id] = 0; -+ //if (class_size_[class_id] == split_size_[class_id]) { -+ if(split_el == NULL) { // we split on everything... - split_size_[class_id] = 0; - return -1; - } else { -- - T new_class = AddClass(); -+ -+ if(allow_repeated_split_) { // split_size_ is possibly -+ // inaccurate, so work it out exactly. 
-+ size_t split_count; Element *e; -+ for(split_count=0,e=classes_[class_id]; -+ e != split_el; split_count++, e=e->next); -+ split_size_[class_id] = split_count; -+ } - size_t remainder = class_size_[class_id] - split_size_[class_id]; - if (remainder < split_size_[class_id]) { // add smaller -- Element* split_el = class_split_[class_id]; - classes_[new_class] = split_el; -+ split_el->prev->next = 0; -+ split_el->prev = 0; - class_size_[class_id] = split_size_[class_id]; - class_size_[new_class] = remainder; -- split_el->prev->next = 0; -- split_el->prev = 0; - } else { -- Element* split_el = class_split_[class_id]; - classes_[new_class] = classes_[class_id]; - class_size_[class_id] = remainder; - class_size_[new_class] = split_size_[class_id]; -@@ -245,10 +254,16 @@ class Partition { - vector class_size_; - - // size of split for each class -+ // in the nondeterministic case, split_size_ is actually an upper -+ // bound on the size of split for each class. - vector split_size_; - - // set of visited classes to be used in split refine - vector visited_classes_; -+ -+ // true if input fst was deterministic: we can make -+ // certain assumptions in this case that speed up the algorithm. -+ bool allow_repeated_split_; - }; - - diff --git a/tools/extras/openfst_gcc41up.patch b/tools/extras/openfst_gcc41up.patch deleted file mode 100644 index 2a47c9b9bd0..00000000000 --- a/tools/extras/openfst_gcc41up.patch +++ /dev/null @@ -1,28 +0,0 @@ -*** lock.h -*************** -*** 78,85 **** - RefCounter() : count_(1) {} - - int count() const { return count_; } -! int Incr() const { return ++count_; } -! int Decr() const { return --count_; } - - private: - mutable int count_; ---- 78,93 ---- - RefCounter() : count_(1) {} - - int count() const { return count_; } -! -! // below lines are modifications of openfst for multi-thrads support, -! // from tools/extras/openfst_gcc41up.patch, applied by tools/Makefile, -! // applicable to gcc 4.1 or above -! // int Incr() const { return ++count_; } -! // int Decr() const { return --count_; } -! -! int Incr() const { return __sync_add_and_fetch(&count_, 1); } -! int Decr() const { return __sync_sub_and_fetch(&count_, 1); } -! 
// end modifications - - private: - mutable int count_; diff --git a/tools/extras/openfstwin-1.3.4.patch b/tools/extras/openfstwin-1.3.4.patch deleted file mode 100644 index 2fbb1d1fc27..00000000000 --- a/tools/extras/openfstwin-1.3.4.patch +++ /dev/null @@ -1,421 +0,0 @@ -diff --git a/src/include/fst/compat.h b/src/include/fst/compat.h -index 00e2dba..ff8bacc 100644 ---- a/src/include/fst/compat.h -+++ b/src/include/fst/compat.h -@@ -37,7 +39,7 @@ typedef SSIZE_T ssize_t; - #pragma comment (lib, "openfst64.lib") - #else - #pragma comment (lib, "openfst.lib") -- #endif -+ #endif - #endif - #endif - #else -diff --git a/src/include/fst/interval-set.h b/src/include/fst/interval-set.h -index c4362f2..58cad44 100644 ---- a/src/include/fst/interval-set.h -+++ b/src/include/fst/interval-set.h -@@ -37,38 +37,38 @@ template - class IntervalSet { - public: - struct Interval { -- T begin; -- T end; -+ T begin_; -+ T end_; - -- Interval() : begin(-1), end(-1) {} -+ Interval() : begin_(-1), end_(-1) {} - -- Interval(T b, T e) : begin(b), end(e) {} -+ Interval(T b, T e) : begin_(b), end_(e) {} - - bool operator<(const Interval &i) const { -- return begin < i.begin || (begin == i.begin && end > i.end); -+ return begin_ < i.begin_ || (begin_ == i.begin_ && end_ > i.end_); - } - - bool operator==(const Interval &i) const { -- return begin == i.begin && end == i.end; -+ return begin_ == i.begin_ && end_ == i.end_; - } - - bool operator!=(const Interval &i) const { -- return begin != i.begin || end != i.end; -+ return begin_ != i.begin_ || end_ != i.end_; - } - - istream &Read(istream &strm) { - T n; - ReadType(strm, &n); -- begin = n; -+ begin_ = n; - ReadType(strm, &n); -- end = n; -+ end_ = n; - return strm; - } - - ostream &Write(ostream &strm) const { -- T n = begin; -+ T n = begin_; - WriteType(strm, n); -- n = end; -+ n = end_; - WriteType(strm, n); - return strm; - } -@@ -108,7 +108,7 @@ class IntervalSet { - lower_bound(intervals_.begin(), intervals_.end(), interval); - if (lb == intervals_.begin()) - return false; -- return (--lb)->end > value; -+ return (--lb)->end_ > value; - } - - // Requires intervals be normalized. 
-@@ -123,7 +123,7 @@ class IntervalSet { - - bool Singleton() const { - return intervals_.size() == 1 && -- intervals_[0].begin + 1 == intervals_[0].end; -+ intervals_[0].begin_ + 1 == intervals_[0].end_; - } - - -@@ -178,17 +178,17 @@ void IntervalSet::Normalize() { - T size = 0; - for (T i = 0; i < intervals_.size(); ++i) { - Interval &inti = intervals_[i]; -- if (inti.begin == inti.end) -+ if (inti.begin_ == inti.end_) - continue; - for (T j = i + 1; j < intervals_.size(); ++j) { - Interval &intj = intervals_[j]; -- if (intj.begin > inti.end) -+ if (intj.begin_ > inti.end_) - break; -- if (intj.end > inti.end) -- inti.end = intj.end; -+ if (intj.end_ > inti.end_) -+ inti.end_ = intj.end_; - ++i; - } -- count_ += inti.end - inti.begin; -+ count_ += inti.end_ - inti.begin_; - intervals_[size++] = inti; - } - intervals_.resize(size); -@@ -208,17 +208,17 @@ void IntervalSet::Intersect(const IntervalSet &iset, - oset->count_ = 0; - - while (it1 != intervals_.end() && it2 != iintervals->end()) { -- if (it1->end <= it2->begin) { -+ if (it1->end_ <= it2->begin_) { - ++it1; -- } else if (it2->end <= it1->begin) { -+ } else if (it2->end_ <= it1->begin_) { - ++it2; - } else { - Interval interval; -- interval.begin = max(it1->begin, it2->begin); -- interval.end = min(it1->end, it2->end); -+ interval.begin_ = max(it1->begin_, it2->begin_); -+ interval.end_ = min(it1->end_, it2->end_); - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -- if (it1->end < it2->end) -+ oset->count_ += interval.end_ - interval.begin_; -+ if (it1->end_ < it2->end_) - ++it1; - else - ++it2; -@@ -235,21 +235,21 @@ void IntervalSet::Complement(T maxval, IntervalSet *oset) const { - oset->count_ = 0; - - Interval interval; -- interval.begin = 0; -+ interval.begin_ = 0; - for (typename vector::const_iterator it = intervals_.begin(); - it != intervals_.end(); - ++it) { -- interval.end = min(it->begin, maxval); -- if (interval.begin < interval.end) { -+ interval.end_ = min(it->begin_, maxval); -+ if (interval.begin_ < interval.end_) { - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -+ oset->count_ += interval.end_ - interval.begin_; - } -- interval.begin = it->end; -+ interval.begin_ = it->end_; - } -- interval.end = maxval; -- if (interval.begin < interval.end) { -+ interval.end_ = maxval; -+ if (interval.begin_ < interval.end_) { - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -+ oset->count_ += interval.end_ - interval.begin_; - } - } - -@@ -263,7 +263,7 @@ void IntervalSet::Difference(const IntervalSet &iset, - oset->count_ = 0; - } else { - IntervalSet cset; -- iset.Complement(intervals_.back().end, &cset); -+ iset.Complement(intervals_.back().end_, &cset); - Intersect(cset, oset); - } - } -@@ -277,9 +277,9 @@ bool IntervalSet::Overlaps(const IntervalSet &iset) const { - typename vector::const_iterator it2 = intervals->begin(); - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { -+ if (it1->end_ <= it2->begin_) { - ++it1; -- } else if (it2->end <= it1->begin) { -+ } else if (it2->end_ <= it1->begin_) { - ++it2; - } else { - return true; -@@ -300,21 +300,21 @@ bool IntervalSet::StrictlyOverlaps(const IntervalSet &iset) const { - bool overlap = false; // point in both intervals_ and intervals - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { // no overlap - it1 first -+ if (it1->end_ <= it2->begin_) { // no overlap - it1 first 
- only1 = true; - ++it1; -- } else if (it2->end <= it1->begin) { // no overlap - it2 first -+ } else if (it2->end_ <= it1->begin_) { // no overlap - it2 first - only2 = true; - ++it2; -- } else if (it2->begin == it1->begin && it2->end == it1->end) { // equals -+ } else if (it2->begin_ == it1->begin_ && it2->end_ == it1->end_) { // equals - overlap = true; - ++it1; - ++it2; -- } else if (it2->begin <= it1->begin && it2->end >= it1->end) { // 1 c 2 -+ } else if (it2->begin_ <= it1->begin_ && it2->end_ >= it1->end_) { // 1 c 2 - only2 = true; - overlap = true; - ++it1; -- } else if (it1->begin <= it2->begin && it1->end >= it2->end) { // 2 c 1 -+ } else if (it1->begin_ <= it2->begin_ && it1->end_ >= it2->end_) { // 2 c 1 - only1 = true; - overlap = true; - ++it2; -@@ -346,11 +346,11 @@ bool IntervalSet::Contains(const IntervalSet &iset) const { - typename vector::const_iterator it2 = intervals->begin(); - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { // no overlap - it1 first -+ if (it1->end_ <= it2->begin_) { // no overlap - it1 first - ++it1; -- } else if (it2->begin < it1->begin || it2->end > it1->end) { // no C -+ } else if (it2->begin_ < it1->begin_ || it2->end_ > it1->end_) { // no C - return false; -- } else if (it2->end == it1->end) { -+ } else if (it2->end_ == it1->end_) { - ++it1; - ++it2; - } else { -@@ -370,7 +370,7 @@ ostream &operator<<(ostream &strm, const IntervalSet &s) { - ++it) { - if (it != intervals->begin()) - strm << ","; -- strm << "[" << it->begin << "," << it->end << ")"; -+ strm << "[" << it->begin_ << "," << it->end_ << ")"; - } - strm << "}"; - return strm; -diff --git a/src/include/fst/label-reachable.h b/src/include/fst/label-reachable.h -index a7c3360..491ef7d 100644 ---- a/src/include/fst/label-reachable.h -+++ b/src/include/fst/label-reachable.h -@@ -359,9 +359,9 @@ class LabelReachable { - iiter = intervals->begin(); - iiter != intervals->end(); ++iiter) { - begin_low = LowerBound(aiter, end_low, aiter_end, -- aiter_input, iiter->begin); -+ aiter_input, iiter->begin_); - end_low = LowerBound(aiter, begin_low, aiter_end, -- aiter_input, iiter->end); -+ aiter_input, iiter->end_); - if (end_low - begin_low > 0) { - if (reach_begin_ < 0) - reach_begin_ = begin_low; -diff --git a/src/include/fst/minimize.h b/src/include/fst/minimize.h -index 3fbe3ba..6e9dd3d 100644 ---- a/src/include/fst/minimize.h -+++ b/src/include/fst/minimize.h -@@ -134,7 +134,14 @@ class CyclicMinimizer { - typedef typename A::Weight Weight; - typedef ReverseArc RevA; - -- CyclicMinimizer(const ExpandedFst& fst) { -+ CyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. -+ P_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // this minimization -+ // algorithm for non-deterministic FSTs can only work with idempotent -+ // semirings. - Initialize(fst); - Compute(fst); - } -@@ -315,7 +322,13 @@ class AcyclicMinimizer { - typedef typename A::StateId ClassId; - typedef typename A::Weight Weight; - -- AcyclicMinimizer(const ExpandedFst& fst) { -+ AcyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. 
-+ partition_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // minimization for -+ // non-deterministic FSTs can only work with idempotent semirings. - Initialize(fst); - Refine(fst); - } -@@ -531,13 +544,7 @@ template - void Minimize(MutableFst* fst, - MutableFst* sfst = 0, - float delta = kDelta) { -- uint64 props = fst->Properties(kAcceptor | kIDeterministic| -- kWeighted | kUnweighted, true); -- if (!(props & kIDeterministic)) { -- FSTERROR() << "FST is not deterministic"; -- fst->SetProperties(kError, kError); -- return; -- } -+ uint64 props = fst->Properties(kAcceptor | kWeighted | kUnweighted, true); - - if (!(props & kAcceptor)) { // weighted transducer - VectorFst< GallicArc > gfst; -diff --git a/src/include/fst/partition.h b/src/include/fst/partition.h -index dcee67b..40b849a 100644 ---- a/src/include/fst/partition.h -+++ b/src/include/fst/partition.h -@@ -43,8 +43,8 @@ class Partition { - friend class PartitionIterator; - - struct Element { -- Element() : value(0), next(0), prev(0) {} -- Element(T v) : value(v), next(0), prev(0) {} -+ Element() : value(0), next(0), prev(0) {} -+ Element(T v) : value(v), next(0), prev(0) {} - - T value; - Element* next; -@@ -52,9 +52,11 @@ class Partition { - }; - - public: -- Partition() {} -+ Partition(bool allow_repeated_split): -+ allow_repeated_split_(allow_repeated_split) {} - -- Partition(T num_states) { -+ Partition(bool allow_repeated_split, T num_states): -+ allow_repeated_split_(allow_repeated_split) { - Initialize(num_states); - } - -@@ -137,16 +139,16 @@ class Partition { - if (class_size_[class_id] == 1) return; - - // first time class is split -- if (split_size_[class_id] == 0) -+ if (split_size_[class_id] == 0) { - visited_classes_.push_back(class_id); -- -+ class_split_[class_id] = classes_[class_id]; -+ } - // increment size of split (set of element at head of chain) - split_size_[class_id]++; -- -+ - // update split point -- if (class_split_[class_id] == 0) -- class_split_[class_id] = classes_[class_id]; -- if (class_split_[class_id] == elements_[element_id]) -+ if (class_split_[class_id] != 0 -+ && class_split_[class_id] == elements_[element_id]) - class_split_[class_id] = elements_[element_id]->next; - - // move to head of chain in same class -@@ -157,24 +159,31 @@ class Partition { - // class indices of the newly created class. Returns the new_class id - // or -1 if no new class was created. - T SplitRefine(T class_id) { -+ -+ Element* split_el = class_split_[class_id]; - // only split if necessary -- if (class_size_[class_id] == split_size_[class_id]) { -- class_split_[class_id] = 0; -+ //if (class_size_[class_id] == split_size_[class_id]) { -+ if(split_el == NULL) { // we split on everything... - split_size_[class_id] = 0; - return -1; - } else { -- - T new_class = AddClass(); -+ -+ if(allow_repeated_split_) { // split_size_ is possibly -+ // inaccurate, so work it out exactly. 
-+ size_t split_count; Element *e; -+ for(split_count=0,e=classes_[class_id]; -+ e != split_el; split_count++, e=e->next); -+ split_size_[class_id] = split_count; -+ } - size_t remainder = class_size_[class_id] - split_size_[class_id]; - if (remainder < split_size_[class_id]) { // add smaller -- Element* split_el = class_split_[class_id]; - classes_[new_class] = split_el; -- class_size_[class_id] = split_size_[class_id]; -- class_size_[new_class] = remainder; - split_el->prev->next = 0; - split_el->prev = 0; -+ class_size_[class_id] = split_size_[class_id]; -+ class_size_[new_class] = remainder; - } else { -- Element* split_el = class_split_[class_id]; - classes_[new_class] = classes_[class_id]; - class_size_[class_id] = remainder; - class_size_[new_class] = split_size_[class_id]; -@@ -245,10 +254,16 @@ class Partition { - vector class_size_; - - // size of split for each class -+ // in the nondeterministic case, split_size_ is actually an upper -+ // bound on the size of split for each class. - vector split_size_; - - // set of visited classes to be used in split refine - vector visited_classes_; -+ -+ // true if input fst was deterministic: we can make -+ // certain assumptions in this case that speed up the algorithm. -+ bool allow_repeated_split_; - }; - - -diff --git a/src/include/fst/state-reachable.h b/src/include/fst/state-reachable.h -index 6d0c971..1da922e 100644 ---- a/src/include/fst/state-reachable.h -+++ b/src/include/fst/state-reachable.h -@@ -112,7 +112,7 @@ class IntervalReachVisitor { - void FinishState(StateId s, StateId p, const A *arc) { - if (index_ >= 0 && fst_.Final(s) != Weight::Zero()) { - vector *intervals = (*isets_)[s].Intervals(); -- (*intervals)[0].end = index_; // Update tree interval end -+ (*intervals)[0].end_ = index_; // Update tree interval end - } - (*isets_)[s].Normalize(); - if (p != kNoStateId) From 03853032b33f034447c396d12e91d4fafbc5d774 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 6 Dec 2016 12:03:07 -0800 Subject: [PATCH 076/213] Fix tools/Makefile to resolve travis failure. --- tools/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index b6687ad1540..787a69e90f5 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -56,8 +56,7 @@ openfst: openfst_compiled openfst-$(OPENFST_VERSION)/lib .PHONY: openfst_compiled openfst_compiled: openfst-$(OPENFST_VERSION)/Makefile - cd openfst-$(OPENFST_VERSION)/ && \ - $(MAKE) install + $(MAKE) -C openfst-$(OPENFST_VERSION) install MAKEOVERRIDES= openfst-$(OPENFST_VERSION)/lib: | openfst-$(OPENFST_VERSION)/Makefile -cd openfst-$(OPENFST_VERSION) && [ -d lib64 ] && [ ! 
-d lib ] && ln -s lib64 lib From c61296f18acd6156affe0a6486b0404bec271794 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 6 Dec 2016 15:54:15 -0800 Subject: [PATCH 077/213] Fix src/configure and add C++11 flag to makefiles/*.mk --- src/configure | 1 - src/makefiles/cygwin.mk | 3 +-- src/makefiles/darwin_10_10.mk | 19 +++++++++---------- src/makefiles/darwin_10_11.mk | 19 +++++++++---------- src/makefiles/darwin_10_12.mk | 19 +++++++++---------- src/makefiles/darwin_10_5.mk | 9 ++++----- src/makefiles/darwin_10_6.mk | 9 ++++----- src/makefiles/darwin_10_7.mk | 9 ++++----- src/makefiles/darwin_10_8.mk | 9 ++++----- src/makefiles/darwin_10_9.mk | 19 +++++++++---------- src/makefiles/linux_atlas.mk | 4 ++-- src/makefiles/linux_atlas_arm.mk | 4 ++-- src/makefiles/linux_clapack.mk | 4 ++-- src/makefiles/linux_clapack_arm.mk | 4 ++-- src/makefiles/linux_openblas.mk | 9 ++++----- src/makefiles/linux_openblas_arm.mk | 9 ++++----- src/makefiles/linux_x86_64_mkl.mk | 6 +++--- 17 files changed, 72 insertions(+), 84 deletions(-) diff --git a/src/configure b/src/configure index 736689dc868..d3e9d63760f 100755 --- a/src/configure +++ b/src/configure @@ -876,7 +876,6 @@ fi OPENFST_VER="${OPENFST_VER:-`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'`}" echo "OPENFST_VER = $OPENFST_VER" >> kaldi.mk -echo "CXXFLAGS += -std=c++0x" >> kaldi.mk # Most of the OS-specific steps below will append to kaldi.mk echo "Doing OS specific configurations ..." diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index 6da982e20a4..e8f926ab986 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -10,7 +10,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ -I ../../tools/CLAPACK/ \ -I $(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ + -std=c++0x $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) @@ -24,4 +24,3 @@ CXX = g++ CC = g++ RANLIB = ranlib AR = ar - diff --git a/src/makefiles/darwin_10_10.mk b/src/makefiles/darwin_10_10.mk index aeff69d4953..498180c6f99 100644 --- a/src/makefiles/darwin_10_10.mk +++ b/src/makefiles/darwin_10_10.mk @@ -5,14 +5,13 @@ ifndef FSTROOT endif DOUBLE_PRECISION = 0 -CXXFLAGS += -msse -msse2 -Wall -I.. \ - -pthread \ +CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ + -std=c++0x $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID @@ -20,13 +19,6 @@ ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif -LDFLAGS = -g -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = $(CXX) -RANLIB = ranlib -AR = ar - # Add no-mismatched-tags flag to suppress the annoying clang warnings # that are perfectly valid per spec. COMPILER = $(shell $(CXX) -v 2>&1 ) @@ -39,3 +31,10 @@ endif ifeq ($(findstring GCC,$(COMPILER)),GCC) CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs endif + +LDFLAGS = -g +LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate +CXX = g++ +CC = $(CXX) +RANLIB = ranlib +AR = ar diff --git a/src/makefiles/darwin_10_11.mk b/src/makefiles/darwin_10_11.mk index 40ee3adf6d0..a2bd5ad028a 100644 --- a/src/makefiles/darwin_10_11.mk +++ b/src/makefiles/darwin_10_11.mk @@ -5,14 +5,13 @@ ifndef FSTROOT endif DOUBLE_PRECISION = 0 -CXXFLAGS += -msse -msse2 -Wall -I.. 
\ - -pthread \ +CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ + -std=c++0x $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ -g # -O0 -DKALDI_PARANOID @@ -20,13 +19,6 @@ ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif -LDFLAGS = -g -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = $(CXX) -RANLIB = ranlib -AR = ar - # Add no-mismatched-tags flag to suppress the annoying clang warnings # that are perfectly valid per spec. COMPILER = $(shell $(CXX) -v 2>&1 ) @@ -39,3 +31,10 @@ endif ifeq ($(findstring GCC,$(COMPILER)),GCC) CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs endif + +LDFLAGS = -g +LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate +CXX = g++ +CC = $(CXX) +RANLIB = ranlib +AR = ar diff --git a/src/makefiles/darwin_10_12.mk b/src/makefiles/darwin_10_12.mk index 10acd2d8577..946788a3db0 100644 --- a/src/makefiles/darwin_10_12.mk +++ b/src/makefiles/darwin_10_12.mk @@ -5,14 +5,13 @@ ifndef FSTROOT endif DOUBLE_PRECISION = 0 -CXXFLAGS += -msse -msse2 -Wall -I.. \ - -pthread \ +CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ + -std=c++0x $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ -g # -O0 -DKALDI_PARANOID @@ -20,13 +19,6 @@ ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif -LDFLAGS = -g -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = $(CXX) -RANLIB = ranlib -AR = ar - # Add no-mismatched-tags flag to suppress the annoying clang warnings # that are perfectly valid per spec. COMPILER = $(shell $(CXX) -v 2>&1 ) @@ -39,3 +31,10 @@ endif ifeq ($(findstring GCC,$(COMPILER)),GCC) CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs endif + +LDFLAGS = -g +LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate +CXX = g++ +CC = $(CXX) +RANLIB = ranlib +AR = ar diff --git a/src/makefiles/darwin_10_5.mk b/src/makefiles/darwin_10_5.mk index 5a1353b3893..6f3e6605226 100644 --- a/src/makefiles/darwin_10_5.mk +++ b/src/makefiles/darwin_10_5.mk @@ -1,22 +1,21 @@ # makefiles/darwin_10_5.mk contains Darwin-specific rules for OS X 10.5.* ifndef FSTROOT -$(error FSTROOT not defined.) + $(error FSTROOT not defined.) endif DOUBLE_PRECISION = 0 -CXXFLAGS += -msse -msse2 -Wall -I.. \ - -pthread \ +CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ + -std=c++0x $(EXTRA_CXXFLAGS) \ -gdwarf-2 # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC + CXXFLAGS += -fPIC endif LDFLAGS = -gdwarf-2 diff --git a/src/makefiles/darwin_10_6.mk b/src/makefiles/darwin_10_6.mk index 50883335a9d..10398326126 100644 --- a/src/makefiles/darwin_10_6.mk +++ b/src/makefiles/darwin_10_6.mk @@ -1,22 +1,21 @@ # makefiles/darwin_10_6.mk contains Darwin-specific rules for OS X 10.6.* ifndef FSTROOT -$(error FSTROOT not defined.) + $(error FSTROOT not defined.) endif DOUBLE_PRECISION = 0 -CXXFLAGS += -msse -msse2 -Wall -I.. 
\ - -pthread \ +CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ + -std=c++0x $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC + CXXFLAGS += -fPIC endif LDFLAGS = -g -rdynamic diff --git a/src/makefiles/darwin_10_7.mk b/src/makefiles/darwin_10_7.mk index ad5a153f5a9..fd491a91968 100644 --- a/src/makefiles/darwin_10_7.mk +++ b/src/makefiles/darwin_10_7.mk @@ -1,23 +1,22 @@ # makefiles/darwin_10_6.mk contains Darwin-specific rules for OS X 10.7.* ifndef FSTROOT -$(error FSTROOT not defined.) + $(error FSTROOT not defined.) endif DOUBLE_PRECISION = 0 -CXXFLAGS += -msse -msse2 -Wall -I.. \ - -pthread \ +CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ + -std=c++0x $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC + CXXFLAGS += -fPIC endif LDFLAGS = -g -rdynamic diff --git a/src/makefiles/darwin_10_8.mk b/src/makefiles/darwin_10_8.mk index c89aea0f44f..54203882c5f 100644 --- a/src/makefiles/darwin_10_8.mk +++ b/src/makefiles/darwin_10_8.mk @@ -1,23 +1,22 @@ # makefiles/darwin_10_8.mk contains Darwin-specific rules for OS X 10.8.* ifndef FSTROOT -$(error FSTROOT not defined.) + $(error FSTROOT not defined.) endif DOUBLE_PRECISION = 0 -CXXFLAGS += -msse -msse2 -Wall -I.. \ - -pthread \ +CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ + -std=c++0x $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC + CXXFLAGS += -fPIC endif LDFLAGS = -g -rdynamic diff --git a/src/makefiles/darwin_10_9.mk b/src/makefiles/darwin_10_9.mk index ede1712e155..c0d2adfd97f 100644 --- a/src/makefiles/darwin_10_9.mk +++ b/src/makefiles/darwin_10_9.mk @@ -5,14 +5,13 @@ ifndef FSTROOT endif DOUBLE_PRECISION = 0 -CXXFLAGS += -msse -msse2 -Wall -I.. \ - -pthread \ +CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Winit-self \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ + -std=c++0x $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID @@ -20,13 +19,6 @@ ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif -LDFLAGS = -g -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = $(CXX) -RANLIB = ranlib -AR = ar - # Add no-mismatched-tags flag to suppress the annoying clang warnings # that are perfectly valid per spec. COMPILER = $(shell $(CXX) -v 2>&1 ) @@ -39,3 +31,10 @@ endif ifeq ($(findstring GCC,$(COMPILER)),GCC) CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs endif + +LDFLAGS = -g +LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate +CXX = g++ +CC = $(CXX) +RANLIB = ranlib +AR = ar diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index a0b757ed39a..9cf05d18b8d 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -21,8 +21,8 @@ CXXFLAGS = -msse -msse2 -Wall -I.. 
\ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_ATLAS -I$(ATLASINC) \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID + -std=c++0x $(EXTRA_CXXFLAGS) \ + -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC diff --git a/src/makefiles/linux_atlas_arm.mk b/src/makefiles/linux_atlas_arm.mk index 07d9e9f3385..07d3b7f5278 100644 --- a/src/makefiles/linux_atlas_arm.mk +++ b/src/makefiles/linux_atlas_arm.mk @@ -21,8 +21,8 @@ CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_ATLAS -I$(ATLASINC) \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID + -std=c++0x $(EXTRA_CXXFLAGS) \ + -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index 83ec0ddce82..de2f1b85aa2 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -7,8 +7,8 @@ CXXFLAGS = -msse -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_CLAPACK -I ../../tools/CLAPACK \ -I ../../tools/openfst/include \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID + -std=c++0x $(EXTRA_CXXFLAGS) \ + -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC diff --git a/src/makefiles/linux_clapack_arm.mk b/src/makefiles/linux_clapack_arm.mk index 94e6ee25bf1..6c20c8734c9 100644 --- a/src/makefiles/linux_clapack_arm.mk +++ b/src/makefiles/linux_clapack_arm.mk @@ -7,8 +7,8 @@ CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_CLAPACK -I ../../tools/CLAPACK \ -I ../../tools/openfst/include \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID + -std=c++0x $(EXTRA_CXXFLAGS) \ + -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index 7a4e2687664..307945222a7 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -14,22 +14,21 @@ endif DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. \ - -pthread \ +CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include \ -I $(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID + -std=c++0x $(EXTRA_CXXFLAGS) \ + -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl CC = g++ CXX = g++ AR = ar diff --git a/src/makefiles/linux_openblas_arm.mk b/src/makefiles/linux_openblas_arm.mk index e4c18e6b4d4..ec9dbd544f9 100644 --- a/src/makefiles/linux_openblas_arm.mk +++ b/src/makefiles/linux_openblas_arm.mk @@ -14,22 +14,21 @@ endif DOUBLE_PRECISION = 0 -CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. \ - -pthread \ +CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. 
-pthread \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include \ -I $(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID + -std=c++0x $(EXTRA_CXXFLAGS) \ + -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl CC = g++ CXX = g++ AR = ar diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index 7186f4bbb88..20ac2fac5df 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -1,9 +1,9 @@ # You have to make sure MKLROOT and (optionally) MKLLIB is set -# We have tested Kaldi with MKL version 10.2 on Linux/GCC and Intel(R) 64 +# We have tested Kaldi with MKL version 10.2 on Linux/GCC and Intel(R) 64 # architecture (also referred to as x86_64) with LP64 interface layer. -# The linking flags for MKL will be very different depending on the OS, +# The linking flags for MKL will be very different depending on the OS, # architecture, compiler, etc. used. The correct flags can be obtained from # http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/ # Use the options obtained from this website to manually configure for other @@ -26,7 +26,7 @@ CXXFLAGS = -m64 -msse -msse2 -pthread -Wall -I.. \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_MKL -I$(MKLROOT)/include \ -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ + -std=c++0x $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) From 8599be4808fbffcfa2797c5d6bce2858f97d1fbe Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 7 Dec 2016 00:07:20 -0800 Subject: [PATCH 078/213] Add check for compiler with C++11 support --- src/fstext/table-matcher.h | 2 +- src/makefiles/cygwin.mk | 2 +- src/makefiles/darwin_10_10.mk | 2 +- src/makefiles/darwin_10_11.mk | 2 +- src/makefiles/darwin_10_12.mk | 2 +- src/makefiles/darwin_10_5.mk | 2 +- src/makefiles/darwin_10_6.mk | 2 +- src/makefiles/darwin_10_7.mk | 2 +- src/makefiles/darwin_10_8.mk | 2 +- src/makefiles/darwin_10_9.mk | 2 +- src/makefiles/linux_atlas.mk | 2 +- src/makefiles/linux_atlas_arm.mk | 2 +- src/makefiles/linux_clapack.mk | 2 +- src/makefiles/linux_clapack_arm.mk | 2 +- src/makefiles/linux_openblas.mk | 2 +- src/makefiles/linux_openblas_arm.mk | 2 +- src/makefiles/linux_x86_64_mkl.mk | 2 +- tools/Makefile | 9 +++++---- tools/extras/check_dependencies.sh | 11 ++++++++++- 19 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/fstext/table-matcher.h b/src/fstext/table-matcher.h index 792fe98fe83..3e704879fb9 100644 --- a/src/fstext/table-matcher.h +++ b/src/fstext/table-matcher.h @@ -259,7 +259,7 @@ class TableMatcher : public MatcherBase { bool safe = false) : impl_(matcher.impl_) { if (safe == true) { - KALDI_ERR << "TableMatcher: Safe copy not supported"; + LOG(FATAL) << "TableMatcher: Safe copy not supported"; } } diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index e8f926ab986..c6871e6802d 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -10,7 +10,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. 
-DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ -I ../../tools/CLAPACK/ \ -I $(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/darwin_10_10.mk b/src/makefiles/darwin_10_10.mk index 498180c6f99..c6d75dc69ae 100644 --- a/src/makefiles/darwin_10_10.mk +++ b/src/makefiles/darwin_10_10.mk @@ -11,7 +11,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID diff --git a/src/makefiles/darwin_10_11.mk b/src/makefiles/darwin_10_11.mk index a2bd5ad028a..b0eba615a49 100644 --- a/src/makefiles/darwin_10_11.mk +++ b/src/makefiles/darwin_10_11.mk @@ -11,7 +11,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ + -std=c++11 $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ -g # -O0 -DKALDI_PARANOID diff --git a/src/makefiles/darwin_10_12.mk b/src/makefiles/darwin_10_12.mk index 946788a3db0..8721a33b304 100644 --- a/src/makefiles/darwin_10_12.mk +++ b/src/makefiles/darwin_10_12.mk @@ -11,7 +11,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ + -std=c++11 $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ -g # -O0 -DKALDI_PARANOID diff --git a/src/makefiles/darwin_10_5.mk b/src/makefiles/darwin_10_5.mk index 6f3e6605226..ae9f59a6f86 100644 --- a/src/makefiles/darwin_10_5.mk +++ b/src/makefiles/darwin_10_5.mk @@ -11,7 +11,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -gdwarf-2 # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/darwin_10_6.mk b/src/makefiles/darwin_10_6.mk index 10398326126..880fff9973a 100644 --- a/src/makefiles/darwin_10_6.mk +++ b/src/makefiles/darwin_10_6.mk @@ -11,7 +11,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/darwin_10_7.mk b/src/makefiles/darwin_10_7.mk index fd491a91968..6cdb7181f96 100644 --- a/src/makefiles/darwin_10_7.mk +++ b/src/makefiles/darwin_10_7.mk @@ -11,7 +11,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID diff --git a/src/makefiles/darwin_10_8.mk b/src/makefiles/darwin_10_8.mk index 54203882c5f..8aa305c5c94 100644 --- a/src/makefiles/darwin_10_8.mk +++ b/src/makefiles/darwin_10_8.mk @@ -11,7 +11,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. 
-pthread \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID diff --git a/src/makefiles/darwin_10_9.mk b/src/makefiles/darwin_10_9.mk index c0d2adfd97f..ee3090f1036 100644 --- a/src/makefiles/darwin_10_9.mk +++ b/src/makefiles/darwin_10_9.mk @@ -11,7 +11,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ -DHAVE_CLAPACK \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index 9cf05d18b8d..d985344f479 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -21,7 +21,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_ATLAS -I$(ATLASINC) \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/linux_atlas_arm.mk b/src/makefiles/linux_atlas_arm.mk index 07d3b7f5278..3359ea5e626 100644 --- a/src/makefiles/linux_atlas_arm.mk +++ b/src/makefiles/linux_atlas_arm.mk @@ -21,7 +21,7 @@ CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_ATLAS -I$(ATLASINC) \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index de2f1b85aa2..d9cd6163ceb 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -7,7 +7,7 @@ CXXFLAGS = -msse -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_CLAPACK -I ../../tools/CLAPACK \ -I ../../tools/openfst/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/linux_clapack_arm.mk b/src/makefiles/linux_clapack_arm.mk index 6c20c8734c9..f155248862c 100644 --- a/src/makefiles/linux_clapack_arm.mk +++ b/src/makefiles/linux_clapack_arm.mk @@ -7,7 +7,7 @@ CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_CLAPACK -I ../../tools/CLAPACK \ -I ../../tools/openfst/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index 307945222a7..2d09bc2bcfc 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -20,7 +20,7 @@ CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include \ -I $(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/linux_openblas_arm.mk b/src/makefiles/linux_openblas_arm.mk index ec9dbd544f9..3a72d96308f 100644 --- a/src/makefiles/linux_openblas_arm.mk +++ b/src/makefiles/linux_openblas_arm.mk @@ -20,7 +20,7 @@ CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. 
-pthread \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include \ -I $(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index 20ac2fac5df..7e9c13e6ac0 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -26,7 +26,7 @@ CXXFLAGS = -m64 -msse -msse2 -pthread -Wall -I.. \ -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ -DHAVE_MKL -I$(MKLROOT)/include \ -I$(FSTROOT)/include \ - -std=c++0x $(EXTRA_CXXFLAGS) \ + -std=c++11 $(EXTRA_CXXFLAGS) \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/tools/Makefile b/tools/Makefile index 787a69e90f5..eb62da22c4e 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,11 +1,12 @@ # SHELL += -x CXX = g++ -# CXX = clang++ # Uncomment this line to build with Clang. -CC = gcc # used for sph2pipe +CC = gcc # used for sph2pipe +# CXX = clang++ # Uncomment these lines +# CC = clang # to build with Clang. -# Note: OpenFst >= 1.5.3 requires C++11 support, hence you will need to use a -# relatively recent C++ compiler, e.g. gcc >= 4.6, clang >= 3.0. +# Note: OpenFst >= 1.5.3 and Kaldi require a relatively recent C++ compiler +# with C++11 support, e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. OPENFST_VERSION = 1.5.4 OPENFST_VER_NUM := $(shell echo $(OPENFST_VERSION) | sed 's/\./ /g' | xargs printf "%d%02d%02d") diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh index f45402e810e..c1b4912c8d9 100755 --- a/tools/extras/check_dependencies.sh +++ b/tools/extras/check_dependencies.sh @@ -21,7 +21,16 @@ fi if ! which g++ >&/dev/null; then echo "$0: g++ is not installed." - add_packages gcc-c++ g++ gcc-c++ + echo " You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + # add_packages gcc-c++ g++ gcc-c++ +elif [[ $(g++ -v 2>&1) == *"GCC"* ]]; then + GCC_VER=$(g++ -dumpversion) + GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") + if [ $GCC_VER_NUM -lt 40700 ]; then + echo "$0: System default g++ ($GCC_VER) does not support C++11." + echo " You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + # add_packages gcc-c++ g++ gcc-c++ + fi fi if ! echo "#include " | gcc -E - >&/dev/null; then From 0d7dbd6c5d4ac667d35f5769a293303e98952694 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 7 Dec 2016 00:38:31 -0800 Subject: [PATCH 079/213] Update installation instructions. --- src/INSTALL | 29 ++++++++++++++++++++--------- tools/INSTALL | 10 +++++++--- tools/Makefile | 6 +++--- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/INSTALL b/src/INSTALL index 3f7a01928ba..8decefe71c2 100644 --- a/src/INSTALL +++ b/src/INSTALL @@ -6,14 +6,25 @@ compilation, see ../windows/INSTALL. You must first have completed the installation steps in ../tools/INSTALL (compiling OpenFst; getting ATLAS and CLAPACK headers). -The installation instructions are: -./configure --shared -make depend -make - -Note that "make" takes a long time; you can speed it up by running make -in parallel if you have multiple CPUs, for instance - make depend -j 8 - make -j 8 +The installation instructions are + + ./configure --shared + make depend + make + +Note that "make" takes a long time. You can speed it up by running make +in parallel if you have multiple CPUs, e.g. 
to use 8 CPUs + + make depend -j 8 + make -j 8 + +Kaldi requires a relatively recent C++ compiler with C++11 support, +e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. If your system +default compiler does not support C++11, you can specify a C++11 compliant +compiler by setting the CXX environment variable, e.g. + + make depend CXX=g++-4.8 + make CXX=g++-4.8 + For more information, see documentation at http://kaldi-asr.org/doc/ and click on "The build process (how Kaldi is compiled)". diff --git a/tools/INSTALL b/tools/INSTALL index 0678e2c8815..7e5549294c8 100644 --- a/tools/INSTALL +++ b/tools/INSTALL @@ -14,12 +14,16 @@ Then run make If you have multiple CPUs and want to speed things up, you can do a parallel -build by supplying the "-j" option to make, e.g. to use 4 CPUs: +build by supplying the "-j" option to make, e.g. to use 4 CPUs make -j 4 -Kaldi builds against OpenFst >= 1.5.3 which requires a relatively new compiler -with C++11 support, e.g. gcc >= 4.6, clang >= 3.0. +OpenFst requires a relatively recent C++ compiler with C++11 support, +e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. If your system +default compiler does not support C++11, you can specify a C++11 compliant +compiler by setting the CXX environment variable, e.g. + + make CXX=g++-4.8 In extras/, there are also various scripts to install extra bits and pieces that are used by individual example scripts. If an example script needs you to run diff --git a/tools/Makefile b/tools/Makefile index eb62da22c4e..f6fe7a45db8 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -5,9 +5,9 @@ CC = gcc # used for sph2pipe # CXX = clang++ # Uncomment these lines # CC = clang # to build with Clang. -# Note: OpenFst >= 1.5.3 and Kaldi require a relatively recent C++ compiler -# with C++11 support, e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. -OPENFST_VERSION = 1.5.4 +# Note: OpenFst requires a relatively recent C++ compiler with C++11 support, +# e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. +OPENFST_VERSION = 1.5.4 # Supported versions: >= 1.5.3 OPENFST_VER_NUM := $(shell echo $(OPENFST_VERSION) | sed 's/\./ /g' | xargs printf "%d%02d%02d") ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10503)","1") From ef13e26a10b87513f0f6632b68500d22efe93177 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 7 Dec 2016 00:54:54 -0800 Subject: [PATCH 080/213] Remove a comment in tools/Makefile to resolve the build problem. --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index f6fe7a45db8..772f8c18398 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -7,7 +7,7 @@ CC = gcc # used for sph2pipe # Note: OpenFst requires a relatively recent C++ compiler with C++11 support, # e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. -OPENFST_VERSION = 1.5.4 # Supported versions: >= 1.5.3 +OPENFST_VERSION = 1.5.4 OPENFST_VER_NUM := $(shell echo $(OPENFST_VERSION) | sed 's/\./ /g' | xargs printf "%d%02d%02d") ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10503)","1") From bd85a3902d7103da20b6794f347013e9c3b921f4 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 7 Dec 2016 22:18:29 -0800 Subject: [PATCH 081/213] Add C++11 compliant compiler check and update installation instructions. 
--- tools/INSTALL | 30 ++++++++++--------- tools/extras/check_dependencies.sh | 48 ++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/tools/INSTALL b/tools/INSTALL index 7e5549294c8..1ca33f9c515 100644 --- a/tools/INSTALL +++ b/tools/INSTALL @@ -1,30 +1,32 @@ - -To install the most important prerequisites for Kaldi: - - first do +To check the prerequisites for Kaldi, first run extras/check_dependencies.sh -to see if there are any system-level installations or modifications you need to do. -Check the output carefully: there are some things that will make your life a lot -easier if you fix them at this stage. +and see if there are any system-level installations you need to do. Check the +output carefully. There are some things that will make your life a lot easier +if you fix them at this stage. If your system default C++ compiler is not +supported, you can do the check with another compiler by setting the CXX +environment variable, e.g. + + CXX=g++-4.8 extras/check_dependencies.sh Then run make +which by default will install ATLAS headers, OpenFst, SCTK and sph2pipe. +OpenFst requires a relatively recent C++ compiler with C++11 support, e.g. +g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. If your system default +compiler does not have adequate support for C++11, you can specify a C++11 +compliant compiler as a command argument, e.g. + + make CXX=g++-4.8 + If you have multiple CPUs and want to speed things up, you can do a parallel build by supplying the "-j" option to make, e.g. to use 4 CPUs make -j 4 -OpenFst requires a relatively recent C++ compiler with C++11 support, -e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. If your system -default compiler does not support C++11, you can specify a C++11 compliant -compiler by setting the CXX environment variable, e.g. - - make CXX=g++-4.8 - In extras/, there are also various scripts to install extra bits and pieces that are used by individual example scripts. If an example script needs you to run one of those scripts, it will tell you what to do. diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh index c1b4912c8d9..3c26fd53e82 100755 --- a/tools/extras/check_dependencies.sh +++ b/tools/extras/check_dependencies.sh @@ -1,4 +1,7 @@ -#!/bin/bash +#!/usr/bin/env bash + +CXX=${CXX:-g++} +status=0 # at some point we could try to add packages for Cywgin or macports(?) to this # script. @@ -19,17 +22,36 @@ if ! which which >&/dev/null; then add_packages which debianutils which fi -if ! which g++ >&/dev/null; then - echo "$0: g++ is not installed." - echo " You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." - # add_packages gcc-c++ g++ gcc-c++ -elif [[ $(g++ -v 2>&1) == *"GCC"* ]]; then - GCC_VER=$(g++ -dumpversion) - GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") - if [ $GCC_VER_NUM -lt 40700 ]; then - echo "$0: System default g++ ($GCC_VER) does not support C++11." - echo " You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." - # add_packages gcc-c++ g++ gcc-c++ +if ! which $CXX >&/dev/null; then + echo "$0: $CXX is not installed." + echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + status=1 +else + COMPILER_VER_INFO=$($CXX --version 2>/dev/null) + if [[ $COMPILER_VER_INFO == *"g++"* ]]; then + GCC_VER=$($CXX -dumpversion) + GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") + if [ $GCC_VER_NUM -lt 40700 ]; then + echo "$0: $CXX (g++-$GCC_VER) is not supported." 
+ echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + status=1 + fi + elif [[ $COMPILER_VER_INFO == *"Apple"* ]]; then + CLANG_VER=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*version \([0-9\.]*\).*/\1/") + CLANG_VER_NUM=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*clang-\([0-9]*\).*/\1/") + if [ $CLANG_VER_NUM -lt 500 ]; then + echo "$0: $CXX (Apple clang-$CLANG_VER) is not supported." + echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + status=1 + fi + elif [[ $COMPILER_VER_INFO == *"LLVM"* ]]; then + CLANG_VER=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*version \([0-9\.]*\).*/\1/") + CLANG_VER_NUM=$(echo $CLANG_VER | sed 's/\./ /g' | xargs printf "%d%02d") + if [ $CLANG_VER_NUM -lt 303 ]; then + echo "$0: $CXX (LLVM clang-$CLANG_VER) is not supported." + echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + status=1 + fi fi fi @@ -141,7 +163,7 @@ fi if [ ! -z "$debian_packages" ]; then # If the list of packages to be installed is nonempty, # we'll exit with error status. Check this outside of - # hecking for yum or apt-get, as we want it to exit with + # checking for yum or apt-get, as we want it to exit with # error even if we're not on Debian or red hat. status=1 fi From da7d11fbe866200837cf2518a27ac6a0faac550f Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 15:56:01 -0800 Subject: [PATCH 082/213] Refactor makefiles/*.mk --- src/makefiles/common.mk | 35 ++++++++++++++----- src/makefiles/cuda_32bit.mk | 5 ++- src/makefiles/cuda_64bit.mk | 5 +-- src/makefiles/cygwin.mk | 28 +++------------ src/makefiles/darwin.mk | 17 +++++++++ src/makefiles/darwin_10_10.mk | 40 --------------------- src/makefiles/darwin_10_11.mk | 40 --------------------- src/makefiles/darwin_10_12.mk | 40 --------------------- src/makefiles/darwin_10_5.mk | 26 -------------- src/makefiles/darwin_10_6.mk | 26 -------------- src/makefiles/darwin_10_7.mk | 27 --------------- src/makefiles/darwin_10_8.mk | 27 --------------- src/makefiles/darwin_10_9.mk | 40 --------------------- src/makefiles/default_rules.mk | 54 ++++++++++++++++------------- src/makefiles/linux_atlas.mk | 32 +++-------------- src/makefiles/linux_atlas_arm.mk | 32 +++-------------- src/makefiles/linux_clapack.mk | 27 ++++----------- src/makefiles/linux_clapack_arm.mk | 27 ++++----------- src/makefiles/linux_openblas.mk | 32 ++++------------- src/makefiles/linux_openblas_arm.mk | 32 ++++------------- src/makefiles/linux_x86_64_mkl.mk | 30 +++------------- 21 files changed, 122 insertions(+), 500 deletions(-) create mode 100644 src/makefiles/darwin.mk delete mode 100644 src/makefiles/darwin_10_10.mk delete mode 100644 src/makefiles/darwin_10_11.mk delete mode 100644 src/makefiles/darwin_10_12.mk delete mode 100644 src/makefiles/darwin_10_5.mk delete mode 100644 src/makefiles/darwin_10_6.mk delete mode 100644 src/makefiles/darwin_10_7.mk delete mode 100644 src/makefiles/darwin_10_8.mk delete mode 100644 src/makefiles/darwin_10_9.mk diff --git a/src/makefiles/common.mk b/src/makefiles/common.mk index 3a464ea99a1..93f6d98c471 100644 --- a/src/makefiles/common.mk +++ b/src/makefiles/common.mk @@ -1,13 +1,30 @@ -# Rules that enable valgrind debugging ("make valgrind") +# Platform independent settings -valgrind: .valgrind +ifndef FSTROOT +$(error FSTROOT not defined.) +endif -.valgrind: - echo -n > valgrind.out - for x in $(TESTFILES); do echo $$x>>valgrind.out; valgrind ./$$x >/dev/null 2>> valgrind.out; done - ! 
( grep 'ERROR SUMMARY' valgrind.out | grep -v '0 errors' ) - ! ( grep 'definitely lost' valgrind.out | grep -v -w 0 ) - rm valgrind.out - touch .valgrind +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif +CXXFLAGS = -std=c++11 -I.. -I$(FSTROOT)/include \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + $(EXTRA_CXXFLAGS) \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +LDFLAGS = $(OPENFSTLDFLAGS) $(EXTRA_LDFLAGS) +LDLIBS = $(OPENFSTLIBS) -lm -lpthread -ldl $(EXTRA_LDLIBS) + +RANLIB = ranlib +AR = ar +AS = as diff --git a/src/makefiles/cuda_32bit.mk b/src/makefiles/cuda_32bit.mk index 38d810acaa8..4c72451fed8 100644 --- a/src/makefiles/cuda_32bit.mk +++ b/src/makefiles/cuda_32bit.mk @@ -1,8 +1,11 @@ +ifndef CUDATKDIR +$(error CUDATKDIR not defined.) +endif + ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif - CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 32 -DHAVE_CUDA \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk index fc11c034d78..691fda6135b 100644 --- a/src/makefiles/cuda_64bit.mk +++ b/src/makefiles/cuda_64bit.mk @@ -1,13 +1,14 @@ +ifndef CUDATKDIR +$(error CUDATKDIR not defined.) +endif ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif - CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 CUDA_LDLIBS += -lcublas -lcudart -lcurand #LDLIBS : The libs are loaded later than static libs in implicit rule - diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index c6871e6802d..48f07e901cf 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -1,26 +1,6 @@ -# makefiles/kaldi.mk.cygwin contains Cygwin-specific rules +# Cygwin settings -ifndef FSTROOT -$(error FSTROOT not defined.) -endif +CXXFLAGS += -msse -msse2 -DHAVE_CLAPACK -I ../../tools/CLAPACK/ -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -DHAVE_CLAPACK -I ../../tools/CLAPACK/ \ - -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -I ../../tools/CLAPACK/ \ - -I $(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif - -LDFLAGS = -g --enable-auto-import -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -L/usr/lib/lapack \ - --enable-auto-import -lcyglapack-0 -lcygblas-0 -lm -lpthread -CXX = g++ -CC = g++ -RANLIB = ranlib -AR = ar +LDFLAGS += -g --enable-auto-import -L/usr/lib/lapack +LDLIBS += -lcyglapack-0 -lcygblas-0 diff --git a/src/makefiles/darwin.mk b/src/makefiles/darwin.mk new file mode 100644 index 00000000000..62bc30c6136 --- /dev/null +++ b/src/makefiles/darwin.mk @@ -0,0 +1,17 @@ +# Darwin (macOS) settings + +CXXFLAGS += -msse -msse2 -pthread \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK + +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. 
+CXXFLAGS += -Wno-mismatched-tags +else ifeq ($(findstring GCC,$(COMPILER)),GCC) +# Allow implicit conversions between vectors. +CXXFLAGS += -flax-vector-conversions +endif + +LDFLAGS += -g +LDLIBS += -framework Accelerate diff --git a/src/makefiles/darwin_10_10.mk b/src/makefiles/darwin_10_10.mk deleted file mode 100644 index c6d75dc69ae..00000000000 --- a/src/makefiles/darwin_10_10.mk +++ /dev/null @@ -1,40 +0,0 @@ -# makefiles/darwin_10_10.mk contains Darwin-specific rules for OS X 10.10.* - -ifndef FSTROOT - $(error FSTROOT not defined.) -endif - -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Winit-self \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - - -ifeq ($(KALDI_FLAVOR), dynamic) - CXXFLAGS += -fPIC -endif - -# Add no-mismatched-tags flag to suppress the annoying clang warnings -# that are perfectly valid per spec. -COMPILER = $(shell $(CXX) -v 2>&1 ) -ifeq ($(findstring clang,$(COMPILER)),clang) - CXXFLAGS += -Wno-mismatched-tags -endif - -# We need to tell recent versions of g++ to allow vector conversions without -# an explicit cast provided the vectors are of the same size. -ifeq ($(findstring GCC,$(COMPILER)),GCC) - CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs -endif - -LDFLAGS = -g -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = $(CXX) -RANLIB = ranlib -AR = ar diff --git a/src/makefiles/darwin_10_11.mk b/src/makefiles/darwin_10_11.mk deleted file mode 100644 index b0eba615a49..00000000000 --- a/src/makefiles/darwin_10_11.mk +++ /dev/null @@ -1,40 +0,0 @@ -# makefiles/darwin_10_11.mk contains Darwin-specific rules for OS X 10.11.* - -ifndef FSTROOT - $(error FSTROOT not defined.) -endif - -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Winit-self \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ - -g # -O0 -DKALDI_PARANOID - - -ifeq ($(KALDI_FLAVOR), dynamic) - CXXFLAGS += -fPIC -endif - -# Add no-mismatched-tags flag to suppress the annoying clang warnings -# that are perfectly valid per spec. -COMPILER = $(shell $(CXX) -v 2>&1 ) -ifeq ($(findstring clang,$(COMPILER)),clang) - CXXFLAGS += -Wno-mismatched-tags -endif - -# We need to tell recent versions of g++ to allow vector conversions without -# an explicit cast provided the vectors are of the same size. -ifeq ($(findstring GCC,$(COMPILER)),GCC) - CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs -endif - -LDFLAGS = -g -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = $(CXX) -RANLIB = ranlib -AR = ar diff --git a/src/makefiles/darwin_10_12.mk b/src/makefiles/darwin_10_12.mk deleted file mode 100644 index 8721a33b304..00000000000 --- a/src/makefiles/darwin_10_12.mk +++ /dev/null @@ -1,40 +0,0 @@ -# makefiles/darwin_10_12.mk contains Darwin-specific rules for OS X 10.12.* - -ifndef FSTROOT - $(error FSTROOT not defined.) -endif - -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. 
-pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Winit-self \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) -Wno-unused-local-typedef \ - -g # -O0 -DKALDI_PARANOID - - -ifeq ($(KALDI_FLAVOR), dynamic) - CXXFLAGS += -fPIC -endif - -# Add no-mismatched-tags flag to suppress the annoying clang warnings -# that are perfectly valid per spec. -COMPILER = $(shell $(CXX) -v 2>&1 ) -ifeq ($(findstring clang,$(COMPILER)),clang) - CXXFLAGS += -Wno-mismatched-tags -endif - -# We need to tell recent versions of g++ to allow vector conversions without -# an explicit cast provided the vectors are of the same size. -ifeq ($(findstring GCC,$(COMPILER)),GCC) - CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs -endif - -LDFLAGS = -g -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = $(CXX) -RANLIB = ranlib -AR = ar diff --git a/src/makefiles/darwin_10_5.mk b/src/makefiles/darwin_10_5.mk deleted file mode 100644 index ae9f59a6f86..00000000000 --- a/src/makefiles/darwin_10_5.mk +++ /dev/null @@ -1,26 +0,0 @@ -# makefiles/darwin_10_5.mk contains Darwin-specific rules for OS X 10.5.* - -ifndef FSTROOT - $(error FSTROOT not defined.) -endif - -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Winit-self \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -gdwarf-2 # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) - CXXFLAGS += -fPIC -endif - -LDFLAGS = -gdwarf-2 -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++-4 -CC = g++-4 -RANLIB = ranlib -AR = ar diff --git a/src/makefiles/darwin_10_6.mk b/src/makefiles/darwin_10_6.mk deleted file mode 100644 index 880fff9973a..00000000000 --- a/src/makefiles/darwin_10_6.mk +++ /dev/null @@ -1,26 +0,0 @@ -# makefiles/darwin_10_6.mk contains Darwin-specific rules for OS X 10.6.* - -ifndef FSTROOT - $(error FSTROOT not defined.) -endif - -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Winit-self \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ - -DHAVE_CLAPACK \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) - CXXFLAGS += -fPIC -endif - -LDFLAGS = -g -rdynamic -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = g++ -RANLIB = ranlib -AR = ar diff --git a/src/makefiles/darwin_10_7.mk b/src/makefiles/darwin_10_7.mk deleted file mode 100644 index 6cdb7181f96..00000000000 --- a/src/makefiles/darwin_10_7.mk +++ /dev/null @@ -1,27 +0,0 @@ -# makefiles/darwin_10_6.mk contains Darwin-specific rules for OS X 10.7.* - -ifndef FSTROOT - $(error FSTROOT not defined.) -endif - -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. 
-pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Winit-self \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ - -DHAVE_CLAPACK \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - - -ifeq ($(KALDI_FLAVOR), dynamic) - CXXFLAGS += -fPIC -endif - -LDFLAGS = -g -rdynamic -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = g++ -RANLIB = ranlib -AR = ar diff --git a/src/makefiles/darwin_10_8.mk b/src/makefiles/darwin_10_8.mk deleted file mode 100644 index 8aa305c5c94..00000000000 --- a/src/makefiles/darwin_10_8.mk +++ /dev/null @@ -1,27 +0,0 @@ -# makefiles/darwin_10_8.mk contains Darwin-specific rules for OS X 10.8.* - -ifndef FSTROOT - $(error FSTROOT not defined.) -endif - -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Winit-self \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -rdynamic \ - -DHAVE_CLAPACK \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - - -ifeq ($(KALDI_FLAVOR), dynamic) - CXXFLAGS += -fPIC -endif - -LDFLAGS = -g -rdynamic -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = g++ -RANLIB = ranlib -AR = ar diff --git a/src/makefiles/darwin_10_9.mk b/src/makefiles/darwin_10_9.mk deleted file mode 100644 index ee3090f1036..00000000000 --- a/src/makefiles/darwin_10_9.mk +++ /dev/null @@ -1,40 +0,0 @@ -# makefiles/darwin_10_9.mk contains Darwin-specific rules for OS X 10.9.* - -ifndef FSTROOT - $(error FSTROOT not defined.) -endif - -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Winit-self \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - - -ifeq ($(KALDI_FLAVOR), dynamic) - CXXFLAGS += -fPIC -endif - -# Add no-mismatched-tags flag to suppress the annoying clang warnings -# that are perfectly valid per spec. -COMPILER = $(shell $(CXX) -v 2>&1 ) -ifeq ($(findstring clang,$(COMPILER)),clang) - CXXFLAGS += -Wno-mismatched-tags -endif - -# We need to tell recent versions of g++ to allow vector conversions without -# an explicit cast provided the vectors are of the same size. -ifeq ($(findstring GCC,$(COMPILER)),GCC) - CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs -endif - -LDFLAGS = -g -LDLIBS = $(EXTRA_LDLIBS) $(FSTROOT)/lib/libfst.a -ldl -lm -lpthread -framework Accelerate -CXX = g++ -CC = $(CXX) -RANLIB = ranlib -AR = ar diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index 17f122622f1..fda52521186 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -3,28 +3,20 @@ SHELL := /bin/bash ifeq ($(KALDI_FLAVOR), dynamic) ifeq ($(shell uname), Darwin) - XLDLIBS := $(LDLIBS) ifdef LIBNAME LIBFILE = lib$(LIBNAME).dylib - #LDLIBS += -l$(LIBNAME) endif - LDFLAGS += -L$(KALDILIBDIR) -Wl,-rpath -Wl,$(KALDILIBDIR) - XDEPENDS = $(foreach dep,$(ADDLIBS), $(dir $(dep))/lib$(notdir $(basename $(dep))).dylib ) - XLDLIBS += $(foreach dep,$(ADDLIBS), -l$(notdir $(basename $(dep))) ) - else - ifeq ($(shell uname), Linux) - ifdef LIBNAME - LIBFILE = lib$(LIBNAME).so - #LDLIBS += -l$(LIBNAME) - endif - LDFLAGS += -Wl,-rpath=$(shell readlink -f $(KALDILIBDIR)) -L. 
- LDFLAGS += $(foreach dep,$(ADDLIBS), -L$(dir $(dep)) ) - XDEPENDS = $(foreach dep,$(ADDLIBS), $(dir $(dep))/lib$(notdir $(basename $(dep))).so ) - else # Platform not supported - $(error Dynamic libraries not supported on this platform. Run configure with --static flag. ) + LDFLAGS += -Wl,-rpath -Wl,$(KALDILIBDIR) + XDEPENDS = $(foreach dep,$(ADDLIBS), $(dir $(dep))lib$(notdir $(basename $(dep))).dylib) + else ifeq ($(shell uname), Linux) + ifdef LIBNAME + LIBFILE = lib$(LIBNAME).so endif + LDFLAGS += -Wl,-rpath=$(shell readlink -f $(KALDILIBDIR)) + XDEPENDS = $(foreach dep,$(ADDLIBS), $(dir $(dep))lib$(notdir $(basename $(dep))).so) + else # Platform not supported + $(error Dynamic libraries not supported on this platform. Run configure with --static flag.) endif - LDLIBS += $(foreach dep,$(ADDLIBS), -l$(notdir $(basename $(dep))) ) else ifdef LIBNAME LIBFILE = $(LIBNAME).a @@ -39,24 +31,24 @@ $(LIBFILE): $(OBJFILES) $(RANLIB) $(LIBNAME).a ifeq ($(KALDI_FLAVOR), dynamic) ifeq ($(shell uname), Darwin) - $(CXX) -dynamiclib -o $@ -install_name @rpath/$@ -framework Accelerate $(LDFLAGS) $(XLDLIBS) $(OBJFILES) $(LDLIBS) + $(CXX) -dynamiclib -o $@ -install_name @rpath/$@ $(LDFLAGS) $(OBJFILES) $(XDEPENDS) $(LDLIBS) rm -f $(KALDILIBDIR)/$@; ln -s $(shell pwd)/$@ $(KALDILIBDIR)/$@ -else -ifeq ($(shell uname), Linux) +else ifeq ($(shell uname), Linux) # Building shared library from static (static was compiled with -fPIC) $(CXX) -shared -o $@ -Wl,--no-undefined -Wl,--as-needed -Wl,-soname=$@,--whole-archive $(LIBNAME).a -Wl,--no-whole-archive $(LDFLAGS) $(XDEPENDS) $(LDLIBS) rm -f $(KALDILIBDIR)/$@; ln -s $(shell pwd)/$@ $(KALDILIBDIR)/$@ - #cp $@ $(KALDILIBDIR) else # Platform not supported - $(error Dynamic libraries not supported on this platform. Run configure with --static flag. ) -endif + $(error Dynamic libraries not supported on this platform. Run configure with --static flag.) endif endif +# By default (GNU) make uses the C compiler $(CC) for linking object files even +# if they were compiled from a C++ source. Below redefinition forces make to +# use the C++ compiler $(CXX) instead. +LINK.o = $(CXX) $(LDFLAGS) $(TARGET_ARCH) $(BINFILES): $(LIBFILE) $(XDEPENDS) - # Rule below would expand to, e.g.: # ../base/kaldi-base.a: # make -c ../base kaldi-base.a @@ -100,8 +92,20 @@ test: test_compile done; \ exit $$result; } -.valgrind: $(BINFILES) $(TESTFILES) +# Rules that enable valgrind debugging ("make valgrind") + +valgrind: .valgrind +.valgrind: $(TESTFILES) + echo -n > valgrind.out + for x in $(TESTFILES); do \ + echo $$x >>valgrind.out; \ + valgrind ./$$x >/dev/null 2>> valgrind.out; \ + done + ! ( grep 'ERROR SUMMARY' valgrind.out | grep -v '0 errors' ) + ! ( grep 'definitely lost' valgrind.out | grep -v -w 0 ) + rm valgrind.out + touch .valgrind depend: -$(CXX) -M $(CXXFLAGS) *.cc > .depend.mk diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index d985344f479..1f366727821 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -1,8 +1,4 @@ -# You have to make sure ATLASLIBS is set... - -ifndef FSTROOT -$(error FSTROOT not defined.) -endif +# ATLAS specific Linux settings ifndef ATLASINC $(error ATLASINC not defined.) @@ -12,26 +8,8 @@ ifndef ATLASLIBS $(error ATLASLIBS not defined.) endif +CXXFLAGS += -msse -msse2 -pthread -rdynamic \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. 
\ - -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_ATLAS -I$(ATLASINC) \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif - -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -CC = g++ -CXX = g++ -AR = ar -AS = as -RANLIB = ranlib +LDFLAGS += -rdynamic +LDLIBS += $(ATLASLIBS) diff --git a/src/makefiles/linux_atlas_arm.mk b/src/makefiles/linux_atlas_arm.mk index 3359ea5e626..5f62f82d297 100644 --- a/src/makefiles/linux_atlas_arm.mk +++ b/src/makefiles/linux_atlas_arm.mk @@ -1,8 +1,4 @@ -# You have to make sure ATLASLIBS is set... - -ifndef FSTROOT -$(error FSTROOT not defined.) -endif +# ATLAS specific Linux ARM settings ifndef ATLASINC $(error ATLASINC not defined.) @@ -12,26 +8,8 @@ ifndef ATLASLIBS $(error ATLASLIBS not defined.) endif +CXXFLAGS += -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) -DOUBLE_PRECISION = 0 -CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. \ - -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_ATLAS -I$(ATLASINC) \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif - -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -CC = g++ -CXX = g++ -AR = ar -AS = as -RANLIB = ranlib +LDFLAGS += -rdynamic +LDLIBS += $(ATLASLIBS) diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index d9cd6163ceb..4d733bb207c 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -1,23 +1,8 @@ -# You have to make sure CLAPACKLIBS is set... +# CLAPACK specific Linux settings -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -Wall -I.. -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -msse2 \ - -Wno-sign-compare -Wno-unused-local-typedefs \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK -I ../../tools/CLAPACK \ - -I ../../tools/openfst/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID +CXXFLAGS += -msse -msse2 -pthread -rdynamic \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ + -DHAVE_CLAPACK -I ../../tools/CLAPACK -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif - -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -CC = g++ -CXX = g++ -AR = ar -AS = as -RANLIB = ranlib +LDFLAGS += -rdynamic +LDLIBS += $(ATLASLIBS) diff --git a/src/makefiles/linux_clapack_arm.mk b/src/makefiles/linux_clapack_arm.mk index f155248862c..7d3119a08c9 100644 --- a/src/makefiles/linux_clapack_arm.mk +++ b/src/makefiles/linux_clapack_arm.mk @@ -1,23 +1,8 @@ -# You have to make sure CLAPACKLIBS is set... +# CLAPACK specific Linux ARM settings -DOUBLE_PRECISION = 0 -CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. 
-pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Wno-unused-local-typedefs \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK -I ../../tools/CLAPACK \ - -I ../../tools/openfst/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID +CXXFLAGS += -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ + -DHAVE_CLAPACK -I ../../tools/CLAPACK -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif - -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -CC = g++ -CXX = g++ -AR = ar -AS = as -RANLIB = ranlib +LDFLAGS += -rdynamic +LDLIBS += $(ATLASLIBS) diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index 2d09bc2bcfc..8636b43e38e 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -1,8 +1,4 @@ -# You have to make sure FSTROOT,OPENBLASROOT,OPENBLASLIBS are set... - -ifndef FSTROOT -$(error FSTROOT not defined.) -endif +# OpenBLAS specific Linux settings ifndef OPENBLASLIBS $(error OPENBLASLIBS not defined.) @@ -12,25 +8,9 @@ ifndef OPENBLASROOT $(error OPENBLASROOT not defined.) endif +CXXFLAGS += -msse -msse2 -pthread -rdynamic \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ + -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include -DOUBLE_PRECISION = 0 -CXXFLAGS = -msse -msse2 -Wall -I.. -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include \ - -I $(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif - -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl -CC = g++ -CXX = g++ -AR = ar -AS = as -RANLIB = ranlib +LDFLAGS += -rdynamic +LDLIBS += $(OPENBLASLIBS) diff --git a/src/makefiles/linux_openblas_arm.mk b/src/makefiles/linux_openblas_arm.mk index 3a72d96308f..682d62b5154 100644 --- a/src/makefiles/linux_openblas_arm.mk +++ b/src/makefiles/linux_openblas_arm.mk @@ -1,8 +1,4 @@ -# You have to make sure FSTROOT,OPENBLASROOT,OPENBLASLIBS are set... - -ifndef FSTROOT -$(error FSTROOT not defined.) -endif +# OpenBLAS specific Linux ARM settings ifndef OPENBLASLIBS $(error OPENBLASLIBS not defined.) @@ -12,25 +8,9 @@ ifndef OPENBLASROOT $(error OPENBLASROOT not defined.) endif +CXXFLAGS += -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ + -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include -DOUBLE_PRECISION = 0 -CXXFLAGS = -ftree-vectorize -mfloat-abi=hard -mfpu=neon -Wall -I.. 
-pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include \ - -I $(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif - -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl -CC = g++ -CXX = g++ -AR = ar -AS = as -RANLIB = ranlib +LDFLAGS += -rdynamic +LDLIBS += $(OPENBLASLIBS) diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index 7e9c13e6ac0..5e93d393b3e 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -1,4 +1,4 @@ -# You have to make sure MKLROOT and (optionally) MKLLIB is set +# MKL specific Linux settings # We have tested Kaldi with MKL version 10.2 on Linux/GCC and Intel(R) 64 # architecture (also referred to as x86_64) with LP64 interface layer. @@ -13,25 +13,10 @@ ifndef MKLROOT $(error MKLROOT not defined.) endif -ifndef FSTROOT -$(error FSTROOT not defined.) -endif - MKLLIB ?= $(MKLROOT)/lib/em64t -DOUBLE_PRECISION = 0 -CXXFLAGS = -m64 -msse -msse2 -pthread -Wall -I.. \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_MKL -I$(MKLROOT)/include \ - -I$(FSTROOT)/include \ - -std=c++11 $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif +CXXFLAGS += -m64 -msse -msse2 -pthread -rdynamic \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_MKL -I$(MKLROOT)/include ## Use the following for STATIC LINKING of the SEQUENTIAL version of MKL MKL_STA_SEQ = $(MKLLIB)/libmkl_solver_lp64_sequential.a -Wl,--start-group \ @@ -53,10 +38,5 @@ MKL_DYN_MUL = -L$(MKLLIB) -lmkl_solver_lp64 -Wl,--start-group -lmkl_intel_lp64 \ # MKLFLAGS = $(MKL_DYN_MUL) -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(MKLFLAGS) -lm -lpthread -ldl -CC = g++ -CXX = g++ -AR = ar -AS = as -RANLIB = ranlib +LDFLAGS += -rdynamic +LDLIBS += $(MKLFLAGS) From bd825b7961cfce536aed4d3f673db87439e63148 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 15:56:50 -0800 Subject: [PATCH 083/213] Clean up configure script. --- src/configure | 478 +++++++++++++++++++++++++++----------------------- 1 file changed, 255 insertions(+), 223 deletions(-) diff --git a/src/configure b/src/configure index d3e9d63760f..a5b4e34b100 100755 --- a/src/configure +++ b/src/configure @@ -1,11 +1,11 @@ #!/bin/bash -# + # This configure script is hand-generated, not auto-generated. # It creates the file kaldi.mk, which is %included by the Makefiles # in the subdirectories. -# The file kaldi.mk is editable by hand-- for example, you may want to +# The file kaldi.mk is editable by hand -- for example, you may want to # remove the options -g -O0 -DKALDI_PARANOID, or edit the -# -DKALDI_DOUBLE_PRECISION option (to be 1 not 0), +# DOUBLE_PRECISION variable (to be 1 not 0). # Example command lines: @@ -23,10 +23,61 @@ # ./configure --use-cuda=no # disable CUDA detection (will build cpu-only # # version of kaldi even on CUDA-enabled machine -#This should be incremented after every significant change of the configure script -#I.e. 
after each change that affects the kaldi.mk or the build system as whole +# This should be incremented after any significant change to the configure +# script, i.e. any change affecting kaldi.mk or the build system as a whole. CONFIGURE_VERSION=5 +if ! [ -x "$PWD/configure" ]; then + echo 'You must run "configure" from the src/ directory.' + exit 1 +fi + +function usage { + cat < + LDFLAGS Additional linker flags, e.g. -L + LDLIBS Additional libraries to pass to the linker, e.g. -l + +EOF +} + function rel2abs { if [ ! -z "$1" ]; then local retval=`cd $1 2>/dev/null && pwd || exit 1` @@ -50,42 +101,106 @@ function is_set { fi } +function failure { + echo "***configure failed: $* ***" >&2 + if [ -f kaldi.mk ]; then rm kaldi.mk; fi + exit 1; +} +function check_exists { + if [ ! -f $1 ]; then failure "$1 not found."; fi +} -## First do some checks. These verify that all the things are -## here that should be here. -if ! [ -x "$PWD/configure" ]; then - echo 'You must run "configure" from the src/ directory.' - exit 1 -fi +function check_compiler { + COMPILER=$1 + if ! which $COMPILER >&/dev/null; then + failure "$COMPILER is not installed. + You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + else + COMPILER_VER_INFO=$($COMPILER --version 2>/dev/null) + if [[ $COMPILER_VER_INFO == *"g++"* ]]; then + GCC_VER=$($COMPILER -dumpversion) + GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") + if [ $GCC_VER_NUM -lt 40700 ]; then + failure "$COMPILER (g++-$GCC_VER) is not supported. + You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + elif [ $GCC_VER_NUM == 40801 ] || [ $GCC_VER_NUM == 40802 ]; then + failure "$COMPILER (g++-$GCC_VER) is not supported. + GCC 4.8.1 and 4.8.2 have a bug in the implementation of + the nth_element algorithm provided by the standard library. + This will cause Kaldi to crash (make test would fail). + Please use another C++ compiler with C++11 support. + You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + fi + elif [[ $COMPILER_VER_INFO == *"Apple"* ]]; then + CLANG_VER=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*version \([0-9\.]*\).*/\1/") + CLANG_VER_NUM=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*clang-\([0-9]*\).*/\1/") + if [ $CLANG_VER_NUM -lt 500 ]; then + failure "$COMPILER (Apple clang-$CLANG_VER) is not supported. + You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + fi + elif [[ $COMPILER_VER_INFO == *"LLVM"* ]]; then + CLANG_VER=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*version \([0-9\.]*\).*/\1/") + CLANG_VER_NUM=$(echo $CLANG_VER | sed 's/\./ /g' | xargs printf "%d%02d") + if [ $CLANG_VER_NUM -lt 303 ]; then + failure "$COMPILER (LLVM clang-$CLANG_VER) is not supported. + You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3." + fi + fi + fi +} -## Default locations for FST and linear algebra libraries. -MATHLIB='ATLAS' -ATLASROOT=`rel2abs ../tools/ATLAS/` -FSTROOT=`rel2abs ../tools/openfst` +function check_for_slow_expf { + cd probe + rm -f exp-test + make -f Makefile.slow_expf 1>/dev/null + ./exp-test + if [ $? -eq 1 ]; then + echo "*** WARNING: expf() seems to be slower than exp() on your machine. This is a known bug in old versions of glibc. Please consider updating glibc. ***" + echo "*** Kaldi will be configured to use exp() instead of expf() in base/kaldi-math.h Exp() routine for single-precision floats. ***" + echo "CXXFLAGS += -DKALDI_NO_EXPF" >> ../kaldi.mk + fi + cd .. 
+} + +function check_library { + local libpath=$1 + local libname=$2 + local libext=$3 + local full_libname="$libpath/$libname.$libext" + ##echo "Testing $full_libname" >&2 + test -f "$full_libname" && return ; + return 1 +} -# Avoid using any variables that are set in the shell. +# If configuration sets any of these variables, we will switch the external +# math library. Here we unset them so that we can check later. unset MKLROOT unset CLAPACKROOT unset OPENBLASROOT unset MKLLIBDIR -function usage { - echo 'Usage: ./configure [--static|--shared] [--threaded-atlas={yes|no}] [--atlas-root=ATLASROOT] [--fst-root=FSTROOT] - [--openblas-root=OPENBLASROOOT] [--clapack-root=CLAPACKROOT] [--mkl-root=MKLROOT] [--mkl-libdir=MKLLIBDIR] - [--omp-libdir=OMPDIR] [--static-fst={yes|no}] [--static-math={yes|no}] [--threaded-math={yes|no}] [--mathlib=ATLAS|MKL|CLAPACK|OPENBLAS] - [--use-cuda={yes|no}] [--cudatk-dir=CUDATKDIR][--mkl-threading=sequential|iomp|tbb|gomp]'; -} +# These environment variables are OK. +CXX=${CXX:-g++} +ENV_CXXFLAGS=$CXXFLAGS +ENV_LDFLAGS=$LDFLAGS +ENV_LDLIBS=$LDLIBS -threaded_atlas=false # By default, use the un-threaded version of ATLAS. -threaded_math=${threaded_atlas} -static_math=false -static_fst=false -use_cuda=true +# Default configuration dynamic_kaldi=false +use_cuda=true +static_fst=false +static_math=false +threaded_atlas=false mkl_threading=sequential +double_precision=false + +MATHLIB='ATLAS' +ATLASROOT=`rel2abs ../tools/ATLAS/` +FSTROOT=`rel2abs ../tools/openfst` -cmd_line="$0 $@" # Save the command line to include in kaldi.mk +# Save the command line to include in kaldi.mk +cmd_line="$0 $@" while [ $# -gt 0 ]; do @@ -104,6 +219,12 @@ do static_math=false; static_fst=false; shift ;; + --double-precision=yes) + double_precision=true; + shift ;; + --double-precision=no) + double_precision=false; + shift ;; --atlas-root=*) ATLASROOT=`read_dirname $1`; shift ;; @@ -115,12 +236,10 @@ do shift ;; --threaded-math=yes) threaded_atlas=true; - threaded_math=true; mkl_threading=iomp shift ;; --threaded-math=no) threaded_atlas=false; - threaded_math=false; mkl_threading=sequential shift ;; --use-cuda=yes) @@ -143,13 +262,11 @@ do shift ;; --mkl-threading=sequential) threaded_atlas=false; - threaded_math=false; mkl_threading=sequential; shift ;; --mkl-threading=*) mkl_threading=`expr "X$1" : '[^=]*=\(.*\)'`; threaded_atlas=true; - threaded_math=true; shift ;; --fst-root=*) FSTROOT=`read_dirname $1`; @@ -172,7 +289,7 @@ do --speex-libdir=*) SPEEXLIBDIR=`read_dirname $1`; shift ;; - --speex-includedir=*) + --speex-incdir=*) SPEEXINCLUDEDIR=`read_dirname $1`; shift ;; --omp-libdir=*) @@ -188,78 +305,18 @@ do esac done -# the idea here is that if you change the configuration options from using +# The idea here is that if you change the configuration options from using # CUDA to not using it, or vice versa, we want to recompile all parts of the -# code that may use a GPU. Touching this file is a way to force this. +# code that may use a GPU. Touching this file is a way to force this. touch cudamatrix/cu-common.h 2>/dev/null -function failure { - echo "***configure failed: $* ***" >&2 - if [ -f kaldi.mk ]; then rm kaldi.mk; fi - exit 1; -} - -function check_exists { - if [ ! 
-f $1 ]; then failure "$1 not found."; fi -} - -function check_for_bad_gcc { - if which gcc >&/dev/null; then # gcc is on the path - gcc_version=$(gcc -dumpspecs 2>&1 | grep -A1 -F '*version:' | grep -v version) - if [ "$gcc_version" == "4.8.2" ] || [ "$gcc_version" == "4.8.1" ]; then - echo "*** WARNING: your version of gcc seems to be 4.8.1 or 4.8.2. ***" - echo "*** These versions of gcc has a bug in nth_element ***" - echo "*** in its implementation of the standard library ***" - echo "*** This will cause Kaldi to crash (make test ***" - echo "*** would fail). Please either upgrade or downgrade gcc. ***" - exit 1 - fi - fi -} - -function check_for_slow_expf { - cd probe - rm -f exp-test - make -f Makefile.slow_expf 1>/dev/null - ./exp-test - if [ $? -eq 1 ]; then - echo "*** WARNING: expf() seems to be slower than exp() on your machine. This is a known bug in old versions of glibc. Please consider updating glibc. ***" - echo "*** Kaldi will be configured to use exp() instead of expf() in base/kaldi-math.h Exp() routine for single-precision floats. ***" - echo "CXXFLAGS += -DKALDI_NO_EXPF" >> ../kaldi.mk - fi - cd .. -} - - -function exit_success { - check_for_bad_gcc; - check_for_slow_expf; - echo "SUCCESS" - exit 0; -} - +# If one of these variables is set, switch the external math library. +is_set $MKLLIBDIR && echo "Configuring KALDI to use MKL" && export MATHLIB="MKL" +is_set $MKLROOT && echo "Configuring KALDI to use MKL"&& export MATHLIB="MKL" +is_set $CLAPACKROOT && echo "Configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" +is_set $OPENBLASROOT && echo "Configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" - -function check_library { - local libpath=$1 - local libname=$2 - local libext=$3 - local full_libname="$libpath/$libname.$libext" - ##echo "Testing $full_libname" >&2 - test -f "$full_libname" && return ; - return 1 -} - - - -#Check if at least one of these variables is set -#If yes, we want to switch to using the MKL -is_set $MKLLIBDIR && echo "Force-configuring KALDI to use MKL" && export MATHLIB="MKL" -is_set $MKLROOT && echo "Force-configuring KALDI to use MKL"&& export MATHLIB="MKL" -is_set $CLAPACKROOT && echo "Force-configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" -is_set $OPENBLASROOT && echo "Force-configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" - -#MKL functions +# MKL functions function linux_configure_mkllibdir { local mklroot=$1 @@ -278,7 +335,6 @@ function linux_configure_mkl_includes { failure "Could not find the MKL include directory" } - function linux_configure_mkl_libraries { local mkllibdir=$1 local static=$2 @@ -414,13 +470,13 @@ function linux_configure_mkl_threading { echo "$OMP_LINK_LINE" } -## -## CUDA is used only in selected directories including src/cudamatrix, src/nnet* -## and src/chain*. It is used to accelerate the neural network training, the -## rest of kaldi runs on CPUs. -## + +# CUDA is used only in selected directories including src/cudamatrix, src/nnet* +# and src/chain*. It is used to accelerate the neural network training. +# The rest of Kaldi runs on CPUs. + function configure_cuda { - #check for CUDA toolkit in the system + # Check for CUDA toolkit in the system if [ ! 
-d "$CUDATKDIR" ]; then for base in /Developer/NVIDIA/CUDA-6.0 /usr/local/share/cuda /usr/local/cuda /pkgs_local/cuda-3.2/ /opt/nvidia_cuda/cuda-6.0/ /usr/; do if [ -f $base/bin/nvcc ]; then @@ -476,7 +532,7 @@ function configure_cuda { } function linux_configure_speex { - #check whether the user has called tools/extras/install_speex.sh or not + # Check whether the user has called tools/extras/install_speex.sh or not [ ! -z "$SPEEXROOT" ] || SPEEXROOT=`pwd`/../tools/speex [ ! -z "$SPEEXLIBDIR" ] || SPEEXLIBDIR="$SPEEXROOT"/lib [ ! -z "$SPEEXINCLUDEDIR" ] || SPEEXINCLUDEDIR="$SPEEXROOT"/include @@ -513,17 +569,7 @@ function linux_configure_speex { fi } -function fix_cxx_flag { - CXXCOMPILER=`grep "CXX = " kaldi.mk | awk '{print $3}'` - if [ $CXXCOMPILER=="g++" ]; then - $CXXCOMPILER -dumpversion | \ - awk '{if(NR==1 && $1<"4.4") print "sed \"s/-Wno-unused-local-typedefs//g\" \ - kaldi.mk > tmpf; mv tmpf kaldi.mk; "}' | sh - - fi -} - -function linux_atlas_failure { # function we use when we couldn't find - # ATLAS libs. +function linux_atlas_failure { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = [somewhere]/liblapack.a [somewhere]/libcblas.a [somewhere]/libatlas.a [somewhere]/libf77blas.a $ATLASLIBDIR >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then @@ -533,7 +579,6 @@ function linux_atlas_failure { # function we use when we couldn't find else cat makefiles/linux_atlas.mk >> kaldi.mk fi - fix_cxx_flag echo "** $* ***" echo "** ERROR **" echo "** Configure cannot proceed automatically." @@ -590,11 +635,9 @@ function linux_configure_debian_ubuntu { else cat makefiles/linux_atlas.mk >> kaldi.mk fi - fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda linux_configure_speex - exit_success; } function linux_configure_debian_ubuntu3 { @@ -615,11 +658,9 @@ function linux_configure_debian_ubuntu3 { else cat makefiles/linux_atlas.mk >> kaldi.mk fi - fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda linux_configure_speex - exit_success; } function linux_configure_debian7 { @@ -643,11 +684,9 @@ function linux_configure_debian7 { else cat makefiles/linux_atlas.mk >> kaldi.mk fi - fix_cxx_flag echo "Successfully configured for Debian 7 [dynamic libraries] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda linux_configure_speex - exit_success; } function linux_configure_redhat { @@ -668,10 +707,8 @@ function linux_configure_redhat { else cat makefiles/linux_atlas.mk >> kaldi.mk fi - fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda - exit_success; } function linux_configure_redhat_fat { @@ -695,13 +732,10 @@ function linux_configure_redhat_fat { else cat makefiles/linux_atlas.mk >> kaldi.mk fi - fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda - exit_success; } - function linux_configure_static { if $threaded_atlas; then pt=pt; else pt=""; fi @@ -754,11 +788,9 @@ function linux_configure_static { else cat makefiles/linux_atlas.mk >> kaldi.mk fi - fix_cxx_flag $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS" - exit_success; } function linux_check_dynamic { @@ -839,43 +871,71 @@ function linux_configure_dynamic { else cat makefiles/linux_atlas.mk >> 
kaldi.mk fi - fix_cxx_flag $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - exit_success; } echo "Configuring ..." -if [ ! -f makefiles/common.mk ]; then - failure makefiles/common.mk not found -fi - -# back up the old one in case we modified it +# Back up the old kaldi.mk in case we modified it if [ -f kaldi.mk ]; then echo "Backing up kaldi.mk to kaldi.mk.bak" cp kaldi.mk kaldi.mk.bak fi +echo "Checking compiler $CXX ..." +check_compiler $CXX + printf "# This file was generated using the following command:\n# $cmd_line\n\n" > kaldi.mk -cat makefiles/common.mk >> kaldi.mk +echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk +echo >> kaldi.mk + +echo "# Configuration" >> kaldi.mk +echo >> kaldi.mk if $dynamic_kaldi ; then KALDILIBDIR=`pwd`/lib echo "KALDI_FLAVOR := dynamic" >> kaldi.mk echo "KALDILIBDIR := $KALDILIBDIR" >> kaldi.mk fi -echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk -echo "FSTROOT = $FSTROOT" >> kaldi.mk - -echo "Checking OpenFST library in $FSTROOT ..." +if $double_precision; then + echo "DOUBLE_PRECISION = 1" >> kaldi.mk +else + echo "DOUBLE_PRECISION = 0" >> kaldi.mk +fi +echo "Checking OpenFst library in $FSTROOT ..." if [ ! -f $FSTROOT/include/fst/fst.h ]; then failure "Could not find file $FSTROOT/include/fst/fst.h: - you may not have installed OpenFst. See ../tools/INSTALL" + you may not have installed OpenFst. See ../tools/INSTALL" fi - +echo "FSTROOT = $FSTROOT" >> kaldi.mk OPENFST_VER="${OPENFST_VER:-`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'`}" echo "OPENFST_VER = $OPENFST_VER" >> kaldi.mk +if $static_fst ; then + OPENFSTLIBS="$FSTROOT/lib/libfst.a" +else + if [ "`uname`" == "Darwin" ]; then + OPENFSTLIBS="$FSTROOT/lib/libfst.dylib" + OPENFSTLDFLAGS="-Wl,-rpath -Wl,${FSTROOT}/lib" + elif ["`uname`" == "Linux" ]; then + OPENFSTLIBS="$FSTROOT/lib/libfst.so" + OPENFSTLDFLAGS="-Wl,-rpath=${FSTROOT}/lib" + else + failure "Dynamic libraries not supported on this platform. + Run configure with --static --static-fst=no flag." + fi +fi +if [ ! -f "$OPENFSTLIBS" ]; then + failure "Static=[$static_fst] OpenFST library not found: See ../tools/INSTALL" +fi +echo "OPENFSTLIBS = $OPENFSTLIBS" >> kaldi.mk +echo "OPENFSTLDFLAGS = $OPENFSTLDFLAGS" >> kaldi.mk +echo "CXX = $CXX" >> kaldi.mk +echo >> kaldi.mk + +# Add platform independent settings +cat makefiles/common.mk >> kaldi.mk +echo >> kaldi.mk # Most of the OS-specific steps below will append to kaldi.mk echo "Doing OS specific configurations ..." @@ -884,58 +944,34 @@ echo "Doing OS specific configurations ..." # which crashes on Darwin. Also the linear algebra libraries on Macs are # used differently (through the Accelerate framework) than on Linux. if [ "`uname`" == "Darwin" ]; then - echo "On Darwin: checking for Accelerate framework ..." + echo "On Darwin: Checking for Accelerate framework ..." if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then - failure "Need the Accelerate.framework to compile on Darwin." + failure "Need the Accelerate framework to compile on Darwin." fi - if [ ! -f $FSTROOT/lib/libfst.a ]; then - failure "Static OpenFST library not found: See ../tools/INSTALL" + OSX_VER=`sw_vers | grep ProductVersion | awk '{print $2}' | awk '{split($0,a,"."); print a[1] "." a[2]; }'` + OSX_VER_NUM=$(echo $OSX_VER | sed 's/\./ /g' | xargs printf "%d%02d") + echo "Configuring for OS X version $OSX_VER ..." 
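  # (Illustrative aside, not part of the patch.) The sed/printf pipeline above
  # packs the version string into one comparable integer, e.g.
  #   "10.9"  -> printf "%d%02d" 10 9  -> 1009
  #   "10.11" -> printf "%d%02d" 10 11 -> 1011
  # so the numeric tests below can order OS X releases correctly; the compiler
  # and OpenFst version checks in this script use the same trick.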
+ if [ $OSX_VER_NUM -ge 1005 ]; then + cat makefiles/darwin.mk >> kaldi.mk + else + failure "Mac OS X version '$OSX_VER' is not supported." fi - # posix_memalign and gcc -rdynamic options not present on OS X 10.5.* - osx_ver=`sw_vers | grep ProductVersion | awk '{print $2}' | awk '{split($0,a,"."); print a[1] "." a[2]; }'` - echo "Configuring for OS X version $osx_ver ..." - if [ "$osx_ver" == "10.5" ]; then - check_exists makefiles/darwin_10_5.mk - cat makefiles/darwin_10_5.mk >> kaldi.mk - elif [ "$osx_ver" == "10.6" ]; then - check_exists makefiles/darwin_10_6.mk - cat makefiles/darwin_10_6.mk >> kaldi.mk - elif [ "$osx_ver" == "10.7" ]; then - check_exists makefiles/darwin_10_7.mk - cat makefiles/darwin_10_7.mk >> kaldi.mk - elif [ "$osx_ver" == "10.8" ]; then - check_exists makefiles/darwin_10_8.mk - cat makefiles/darwin_10_8.mk >> kaldi.mk - elif [ "$osx_ver" == "10.9" ]; then - check_exists makefiles/darwin_10_9.mk - cat makefiles/darwin_10_9.mk >> kaldi.mk - elif [ "$osx_ver" == "10.10" ]; then - check_exists makefiles/darwin_10_10.mk - cat makefiles/darwin_10_10.mk >> kaldi.mk - elif [ "$osx_ver" == "10.11" ]; then - check_exists makefiles/darwin_10_11.mk - cat makefiles/darwin_10_11.mk >> kaldi.mk + + if [ $OSX_VER_NUM == 1011 ]; then echo "**BAD WARNING**: You are using OS X El Capitan. Some versions of this OS" echo "**BAD WARNING**: have a bug in the BLAS implementation that affects Kaldi." echo "**BAD WARNING**: After compiling, cd to matrix/ and type 'make test'. The" echo "**BAD WARNING**: test will fail if the problem exists in your version. " echo "**BAD WARNING**: Eventually this issue will be fixed by system updates from" - echo "**BAD WARNING** Apple. Unexplained crashes with reports of NaNs will" - echo "**BAD WARNING** be caused by this bug, but some recipes will (sometimes) work." + echo "**BAD WARNING**: Apple. Unexplained crashes with reports of NaNs will" + echo "**BAD WARNING**: be caused by this bug, but some recipes will (sometimes) work." sleep 1; echo -n .; sleep 1; echo -n .; sleep 1; echo . - elif [ "$osx_ver" == "10.12" ]; then - check_exists makefiles/darwin_10_12.mk - cat makefiles/darwin_10_12.mk >> kaldi.mk - else - failure "OS X version '$osx_ver' not supported" fi + echo "Successfully configured for Darwin with Accelerate framework." $use_cuda && configure_cuda - echo "Configuration succeeded for platform Darwin." - exit_success; -fi -if [ "`uname -o`" == "Cygwin" ]; then - echo "On Cygwin: checking for linear algebra libraries ..." +elif [ "`uname -o`" == "Cygwin" ]; then + echo "On Cygwin: Checking for linear algebra libraries ..." if [ ! -f ../tools/CLAPACK/clapack.h ]; then failure "could not find file ../tools/CLAPACK/clapack.h" fi @@ -943,25 +979,9 @@ if [ "`uname -o`" == "Cygwin" ]; then failure "please first install package liblapack0" fi cat makefiles/cygwin.mk >> kaldi.mk - echo "Configuration succeeded for platform cygwin" - exit_success; -fi - -if [ "`uname`" == "Linux" ]; then - if $static_fst ; then - OPENFSTLIBS="$FSTROOT/lib/libfst.a" - fst_type='a' - else - OPENFSTLIBS="-L${FSTROOT}/lib -lfst" - OPENFSTLDFLAGS="-Wl,-rpath=${FSTROOT}/lib" - fst_type='so' - fi - if [ ! -f "$FSTROOT/lib/libfst.${fst_type}" ]; then - failure "Static=[$static_fst] OpenFST library not found: See ../tools/INSTALL" - fi - echo OPENFSTLIBS = $OPENFSTLIBS >> kaldi.mk - echo OPENFSTLDFLAGS = $OPENFSTLDFLAGS >> kaldi.mk + echo "Successfully configured for Cygwin with CLAPACK." 
+elif [ "`uname`" == "Linux" ]; then echo "On Linux: Checking for linear algebra header files ..." if [ "$MATHLIB" == "ATLAS" ]; then if [ ! -f $ATLASROOT/include/cblas.h ] || [ ! -f $ATLASROOT/include/clapack.h ] ; then @@ -992,7 +1012,7 @@ if [ "`uname`" == "Linux" ]; then linux_configure_redhat || \ linux_configure_redhat_fat 64 || \ linux_configure_redhat_fat || \ - linux_atlas_failure "Failed to configure ATLAS lbiraries"; + linux_atlas_failure "Failed to configure ATLAS libraries"; else # Prefer dynamic to static math. linux_configure_debian_ubuntu3 || \ @@ -1005,7 +1025,7 @@ if [ "`uname`" == "Linux" ]; then linux_configure_redhat || \ linux_configure_redhat_fat 64 || \ linux_configure_redhat_fat || \ - linux_atlas_failure "Failed to configure ATLAS lbiraries"; + linux_atlas_failure "Failed to configure ATLAS libraries"; fi elif [ "$MATHLIB" == "MKL" ]; then @@ -1052,26 +1072,23 @@ if [ "`uname`" == "Linux" ]; then fi check_exists makefiles/linux_x86_64_mkl.mk cat makefiles/linux_x86_64_mkl.mk >> kaldi.mk - fix_cxx_flag echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk - + echo "Successfully configured for Linux with MKL libs from $MKLROOT" $use_cuda && configure_cuda linux_configure_speex - echo "Successfully configured for Linux with MKL libs from $MKLROOT" - exit_success; elif [ "$MATHLIB" == "CLAPACK" ]; then if [ -z "$CLAPACKROOT" ]; then failure "Must specify the location of CLAPACK with --clapack-root option (and it must exist)" fi if [ ! -f ../tools/CLAPACK/clapack.h ]; then - failure could not find file ../tools/CLAPACK/clapack.h + failure "could not find file ../tools/CLAPACK/clapack.h" fi if [ ! -d "$CLAPACKROOT" ]; then failure "The directory $CLAPACKROOT does not exist" fi # Also check for cblas.h and f2c.h - echo "Using CLAPACK as the linear algebra library." + echo "Using CLAPACK libs from $CLAPACKROOT as the linear algebra library." if [ ! -f makefiles/linux_clapack.mk ]; then failure "makefiles/linux_clapack.mk not found." fi @@ -1080,12 +1097,11 @@ if [ "`uname`" == "Linux" ]; then else cat makefiles/linux_clapack.mk >> kaldi.mk fi - fix_cxx_flag - echo "Warning (CLAPACK): this part of the configure process is not properly tested and will not work." + echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work." + echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT" $use_cuda && configure_cuda linux_configure_speex - echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT" - exit_success; + elif [ "$MATHLIB" == "OPENBLAS" ]; then OPENBLASROOT=`rel2abs "$OPENBLASROOT"` if [ -z "$OPENBLASROOT" ]; then @@ -1094,7 +1110,7 @@ if [ "`uname`" == "Linux" ]; then if [ ! -f $OPENBLASROOT/lib/libopenblas.so ]; then failure "Expected to find the file $OPENBLASROOT/lib/libopenblas.so" fi - echo "Your math library seems to be OpenBLAS. Configuring appropriately." + echo "Your math library seems to be OpenBLAS from $OPENBLASROOT. Configuring appropriately." if $static_math; then echo "Configuring static OpenBlas since --static-math=yes" OPENBLASLIBS="$OPENBLASROOT/lib/libopenblas.a -lgfortran" @@ -1111,14 +1127,30 @@ if [ "`uname`" == "Linux" ]; then else cat makefiles/linux_openblas.mk >> kaldi.mk fi - fix_cxx_flag + echo "Successfully configured for Linux with OpenBLAS from $OPENBLASROOT" $use_cuda && configure_cuda linux_configure_speex - echo "Successfully configured OpenBLAS from $OPENBLASROOT." 
- exit_success; + else failure "Unsupported linear algebra library '$MATHLIB'" fi +else + failure "Could not detect the platform or we have not yet worked out the + appropriate configuration for this platform. + Please contact the developers." fi -failure Could not detect platform or we have not yet worked out the appropriate configuration for this platform. Please contact the developers. +# Append the flags set by environment variables last so they can be used +# to override the automatically generated configuration. +echo >> kaldi.mk +echo "# Environment settings" >> kaldi.mk +echo >> kaldi.mk +if [ -n "$ENV_CXXFLAGS" ]; then echo "CXXFLAGS += $ENV_CXXFLAGS" >> kaldi.mk; fi +if [ -n "$ENV_LDFLAGS" ]; then echo "LDFLAGS += $ENV_LDFLAGS" >> kaldi.mk; fi +if [ -n "$ENV_LDLIBS" ]; then echo "LDLIBS += $ENV_LDLIBS" >> kaldi.mk; fi + +# We check for slow exp implementation just before we exit. This check uses +# and possibly modifies the kaldi.mk file that we just generated. +check_for_slow_expf; +echo "SUCCESS" +exit 0; From bf8aad8c7727c68565bde66ef6587c504011e364 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 15:57:14 -0800 Subject: [PATCH 084/213] Update travis script. --- tools/extras/travis_script.sh | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index a857f538edd..2b8652a1f25 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -45,23 +45,18 @@ then exit 0; fi -# Prepare make command fragments. +# Prepare environment variables CF="$CFLAGS -g $(addsw -I $INCDIRS)" LDF="$LDFLAGS $(addsw -L $LIBDIRS)" -CCC="$(mtoken CC $CXX) $(mtoken CXX $CXX)" +CCC="$(mtoken CXX $CXX)" runvx cd tools runvx make openfst $CCC CXXFLAGS="$CF" -j$MAXPAR cd .. runvx cd src -runvx ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr +runvx $CCC CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr +runvx make all -j$MAXPAR +runvx make test -k -make_kaldi() { - runvx make "$@" $CCC EXTRA_CXXFLAGS="$CF" EXTRA_LDLIBS="$LDF" -} - -#make_kaldi mklibdir base matrix -j$MAXPAR -#make_kaldi matrix/test - -make_kaldi all -j$MAXPAR -make_kaldi test -k +#runvx make mklibdir base matrix -j$MAXPAR +#runvx make matrix/test From bbb7f5daf96e54078777e96f02c56c3c812f0372 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 15:58:31 -0800 Subject: [PATCH 085/213] Initialize a few variables to silence compiler warnings. --- src/nnet/nnet-utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnet/nnet-utils.h b/src/nnet/nnet-utils.h index 869bb174f23..8b1afbbed3b 100644 --- a/src/nnet/nnet-utils.h +++ b/src/nnet/nnet-utils.h @@ -243,7 +243,7 @@ inline void BuildIntegerVector(const std::vector >& in, // loop over records, for (int32 i = 0; i < in.size(); i++) { // process i'th record, - int32 beg, end, step; + int32 beg = 0, end = 0, step = 1; switch (in[i].size()) { case 1: beg = in[i][0]; From 41c77be1a30f62fb00aada1b4bc6e3186371a91a Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 16:11:12 -0800 Subject: [PATCH 086/213] Fix spacing error in configure. 
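The one-character change below matters because '[' is itself a command name in
the shell and must therefore be a separate word. Without the space the word
expands to something like '[Linux', the shell reports "command not found", the
elif test evaluates as false, and a dynamic-OpenFst build on Linux falls
through to the misleading "Dynamic libraries not supported on this platform"
failure. A minimal sketch of the difference (illustrative, not taken from the
patch):

    if ["`uname`" == "Linux" ]; then echo linux; fi    # error: "[Linux: command not found"
    if [ "`uname`" == "Linux" ]; then echo linux; fi   # behaves as intended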
--- src/configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/configure b/src/configure index a5b4e34b100..b92729319a7 100755 --- a/src/configure +++ b/src/configure @@ -917,7 +917,7 @@ else if [ "`uname`" == "Darwin" ]; then OPENFSTLIBS="$FSTROOT/lib/libfst.dylib" OPENFSTLDFLAGS="-Wl,-rpath -Wl,${FSTROOT}/lib" - elif ["`uname`" == "Linux" ]; then + elif [ "`uname`" == "Linux" ]; then OPENFSTLIBS="$FSTROOT/lib/libfst.so" OPENFSTLDFLAGS="-Wl,-rpath=${FSTROOT}/lib" else From 817a314a2fd7d574b4ea6aa94c2f2e6bdafa4e44 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 16:55:12 -0800 Subject: [PATCH 087/213] Fix travis script. --- tools/extras/travis_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 2b8652a1f25..d347d5dfd58 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -17,7 +17,7 @@ TESTABLE_DIRS="src/" # Run verbose (run and echo) and exit if failed. runvx() { echo "\$ $@" - "$@" || exit 1 + eval "$@" || exit 1 } # $(addsw -L foo bar) => "-Lfoo -Lbar". From 20b56451012adbcbcc057c14ac0f7914fd99c968 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 17:21:58 -0800 Subject: [PATCH 088/213] Yet another fix for the travis script. --- tools/extras/travis_script.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index d347d5dfd58..b0acca7e4cf 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -17,7 +17,7 @@ TESTABLE_DIRS="src/" # Run verbose (run and echo) and exit if failed. runvx() { echo "\$ $@" - eval "$@" || exit 1 + "$@" || exit 1 } # $(addsw -L foo bar) => "-Lfoo -Lbar". @@ -54,7 +54,9 @@ runvx cd tools runvx make openfst $CCC CXXFLAGS="$CF" -j$MAXPAR cd .. runvx cd src -runvx $CCC CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr +# runvx does not work when we have environment variables as prefix +echo "$CCC CXXFLAGS=$CF LDFLAGS=$LDF ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr" +$CCC CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root="$XROOT/usr" || exit 1 runvx make all -j$MAXPAR runvx make test -k From b4cc589a8bffd63166b1fbef3042096cc716425a Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 17:35:45 -0800 Subject: [PATCH 089/213] One more fix to travis script. --- tools/extras/travis_script.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index b0acca7e4cf..97dec920b44 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -55,8 +55,8 @@ runvx make openfst $CCC CXXFLAGS="$CF" -j$MAXPAR cd .. 
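# (Editorial aside, not part of the patch.) The "environment variables as
# prefix" problem mentioned just below comes from how the shell parses
# commands: words are recognized as assignments while the line is parsed,
# before "$@" is expanded. With the helper defined as
#   runvx() { echo "\$ $@"; "$@" || exit 1; }
# a call such as
#   runvx CXX=g++ ./configure --shared
# tries to execute a command literally named "CXX=g++" and fails. That is why
# this patch stops routing the configure command through runvx, and why the
# follow-up patches sort out the remaining quoting of CCC, CF and LDF.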
runvx cd src # runvx does not work when we have environment variables as prefix -echo "$CCC CXXFLAGS=$CF LDFLAGS=$LDF ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr" -$CCC CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root="$XROOT/usr" || exit 1 +echo "CXX=$CXX CXXFLAGS=$CF LDFLAGS=$LDF ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr" +CXX="$CXX" CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root="$XROOT/usr" || exit 1 runvx make all -j$MAXPAR runvx make test -k From 77ec36a7dcbbd6159f14015c698f56e69a8fa154 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 20:59:01 -0800 Subject: [PATCH 090/213] Quote environment variables defined in travis script to resolve the build error. --- tools/extras/travis_script.sh | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 97dec920b44..0164e8532ab 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -17,7 +17,7 @@ TESTABLE_DIRS="src/" # Run verbose (run and echo) and exit if failed. runvx() { echo "\$ $@" - "$@" || exit 1 + eval "$@" || exit 1 } # $(addsw -L foo bar) => "-Lfoo -Lbar". @@ -46,17 +46,15 @@ then fi # Prepare environment variables -CF="$CFLAGS -g $(addsw -I $INCDIRS)" -LDF="$LDFLAGS $(addsw -L $LIBDIRS)" -CCC="$(mtoken CXX $CXX)" +CF="\"$CFLAGS -g $(addsw -I $INCDIRS)\"" +LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" +CCC="\"$(mtoken CXX $CXX)\"" runvx cd tools -runvx make openfst $CCC CXXFLAGS="$CF" -j$MAXPAR +runvx make openfst "$CCC" CXXFLAGS="$CF" -j$MAXPAR cd .. runvx cd src -# runvx does not work when we have environment variables as prefix -echo "CXX=$CXX CXXFLAGS=$CF LDFLAGS=$LDF ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr" -CXX="$CXX" CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root="$XROOT/usr" || exit 1 +runvx "$CCC" CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root="$XROOT/usr" runvx make all -j$MAXPAR runvx make test -k From ec76eca5d1dcffb9502d1554b3e64f88b6b2f642 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 21:24:21 -0800 Subject: [PATCH 091/213] Fix the quoting in mtoken function defined in travis script. --- tools/extras/travis_script.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 0164e8532ab..15e284f66a6 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -28,7 +28,7 @@ addsw() { } # $(mtoken CXX gcc) => "CXX=gcc"; # $(mtoken CXX ) => "". -mtoken() { echo ${2+$1=$2}; } +mtoken() { echo ${2+$1=\"$2\"}; } # Print machine info and environment. runvx uname -a @@ -48,7 +48,7 @@ fi # Prepare environment variables CF="\"$CFLAGS -g $(addsw -I $INCDIRS)\"" LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" -CCC="\"$(mtoken CXX $CXX)\"" +CCC="$(mtoken CXX "$CXX")" runvx cd tools runvx make openfst "$CCC" CXXFLAGS="$CF" -j$MAXPAR From e5a01206dbb761184484e02982af2362afbda6cd Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 15 Dec 2016 22:12:51 -0800 Subject: [PATCH 092/213] Run tests in parallel to avoid the travis timeout. 
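The change below only appends -j$MAXPAR to the existing "make test -k"
invocation. With MAXPAR at, say, 4 (its value is set elsewhere in the script;
4 is only an illustrative figure) the effective command becomes

    make test -k -j4

where -k keeps make going past individual failing tests so one broken binary
does not mask the rest of the results, and -j lets independent test targets
build and run concurrently, which is what keeps the job inside Travis's time
limit.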
--- tools/extras/travis_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 15e284f66a6..5aefdd3e543 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -56,7 +56,7 @@ cd .. runvx cd src runvx "$CCC" CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root="$XROOT/usr" runvx make all -j$MAXPAR -runvx make test -k +runvx make test -k -j$MAXPAR #runvx make mklibdir base matrix -j$MAXPAR #runvx make matrix/test From 75105e92c6d1e64a749044730bc819f3875c010e Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 20 Dec 2016 23:19:07 -0800 Subject: [PATCH 093/213] Reorganize platform specific makefiles. --- src/makefiles/common.mk | 30 -------------------------- src/makefiles/cuda_32bit.mk | 1 - src/makefiles/cuda_64bit.mk | 1 - src/makefiles/cygwin.mk | 31 ++++++++++++++++++++++++--- src/makefiles/darwin.mk | 32 +++++++++++++++++++++++----- src/makefiles/linux_atlas.mk | 30 +++++++++++++++++++++----- src/makefiles/linux_atlas_arm.mk | 30 +++++++++++++++++++++----- src/makefiles/linux_clapack.mk | 31 ++++++++++++++++++++++----- src/makefiles/linux_clapack_arm.mk | 31 ++++++++++++++++++++++----- src/makefiles/linux_openblas.mk | 33 +++++++++++++++++++++++------ src/makefiles/linux_openblas_arm.mk | 33 +++++++++++++++++++++++------ src/makefiles/linux_x86_64_mkl.mk | 29 +++++++++++++++++++++---- 12 files changed, 234 insertions(+), 78 deletions(-) delete mode 100644 src/makefiles/common.mk diff --git a/src/makefiles/common.mk b/src/makefiles/common.mk deleted file mode 100644 index 93f6d98c471..00000000000 --- a/src/makefiles/common.mk +++ /dev/null @@ -1,30 +0,0 @@ -# Platform independent settings - -ifndef FSTROOT -$(error FSTROOT not defined.) -endif - -ifndef DOUBLE_PRECISION -$(error DOUBLE_PRECISION not defined.) -endif - -ifndef OPENFSTLIBS -$(error OPENFSTLIBS not defined.) -endif - -CXXFLAGS = -std=c++11 -I.. -I$(FSTROOT)/include \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif - -LDFLAGS = $(OPENFSTLDFLAGS) $(EXTRA_LDFLAGS) -LDLIBS = $(OPENFSTLIBS) -lm -lpthread -ldl $(EXTRA_LDLIBS) - -RANLIB = ranlib -AR = ar -AS = as diff --git a/src/makefiles/cuda_32bit.mk b/src/makefiles/cuda_32bit.mk index 4c72451fed8..4019d5027b1 100644 --- a/src/makefiles/cuda_32bit.mk +++ b/src/makefiles/cuda_32bit.mk @@ -1,7 +1,6 @@ ifndef CUDATKDIR $(error CUDATKDIR not defined.) endif - ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk index 691fda6135b..0ce7bacdd00 100644 --- a/src/makefiles/cuda_64bit.mk +++ b/src/makefiles/cuda_64bit.mk @@ -1,7 +1,6 @@ ifndef CUDATKDIR $(error CUDATKDIR not defined.) endif - ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index 48f07e901cf..beaea294638 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -1,6 +1,31 @@ # Cygwin settings -CXXFLAGS += -msse -msse2 -DHAVE_CLAPACK -I ../../tools/CLAPACK/ +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) 
+endif -LDFLAGS += -g --enable-auto-import -L/usr/lib/lapack -LDLIBS += -lcyglapack-0 -lcygblas-0 +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_CLAPACK -I ../../tools/CLAPACK/ \ + -msse -msse2 \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g \ + --enable-auto-import -L/usr/lib/lapack +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -lcyglapack-0 -lcygblas-0 \ + -lm -lpthread -ldl + +RANLIB = ranlib +AR = ar +AS = as diff --git a/src/makefiles/darwin.mk b/src/makefiles/darwin.mk index 62bc30c6136..2f1692018ac 100644 --- a/src/makefiles/darwin.mk +++ b/src/makefiles/darwin.mk @@ -1,7 +1,32 @@ # Darwin (macOS) settings -CXXFLAGS += -msse -msse2 -pthread \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif + +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK \ + -msse -msse2 -pthread \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -framework Accelerate -lm -lpthread -ldl + +RANLIB = ranlib +AR = ar +AS = as # Compiler specific flags COMPILER = $(shell $(CXX) -v 2>&1) @@ -12,6 +37,3 @@ else ifeq ($(findstring GCC,$(COMPILER)),GCC) # Allow implicit conversions between vectors. CXXFLAGS += -flax-vector-conversions endif - -LDFLAGS += -g -LDLIBS += -framework Accelerate diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index 1f366727821..3fbeab9bed3 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -1,15 +1,35 @@ # ATLAS specific Linux settings +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif ifndef ATLASINC $(error ATLASINC not defined.) endif - ifndef ATLASLIBS $(error ATLASLIBS not defined.) endif -CXXFLAGS += -msse -msse2 -pthread -rdynamic \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) \ + -msse -msse2 -pthread -rdynamic \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -LDFLAGS += -rdynamic -LDLIBS += $(ATLASLIBS) +RANLIB = ranlib +AR = ar +AS = as diff --git a/src/makefiles/linux_atlas_arm.mk b/src/makefiles/linux_atlas_arm.mk index 5f62f82d297..0bbcbdd2acd 100644 --- a/src/makefiles/linux_atlas_arm.mk +++ b/src/makefiles/linux_atlas_arm.mk @@ -1,15 +1,35 @@ # ATLAS specific Linux ARM settings +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) 
+endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif ifndef ATLASINC $(error ATLASINC not defined.) endif - ifndef ATLASLIBS $(error ATLASLIBS not defined.) endif -CXXFLAGS += -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) \ + -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -LDFLAGS += -rdynamic -LDLIBS += $(ATLASLIBS) +RANLIB = ranlib +AR = ar +AS = as diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index 4d733bb207c..60fbf4918e3 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -1,8 +1,29 @@ # CLAPACK specific Linux settings -CXXFLAGS += -msse -msse2 -pthread -rdynamic \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK -I ../../tools/CLAPACK +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif -LDFLAGS += -rdynamic -LDLIBS += $(ATLASLIBS) +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK -I../../tools/CLAPACK \ + -msse -msse2 -pthread -rdynamic \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl + +RANLIB = ranlib +AR = ar +AS = as diff --git a/src/makefiles/linux_clapack_arm.mk b/src/makefiles/linux_clapack_arm.mk index 7d3119a08c9..ab49a3e6c13 100644 --- a/src/makefiles/linux_clapack_arm.mk +++ b/src/makefiles/linux_clapack_arm.mk @@ -1,8 +1,29 @@ # CLAPACK specific Linux ARM settings -CXXFLAGS += -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK -I ../../tools/CLAPACK +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif -LDFLAGS += -rdynamic -LDLIBS += $(ATLASLIBS) +CXXFLAGS = -std=c++11 -I.. 
-I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK -I../../tools/CLAPACK \ + -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl + +RANLIB = ranlib +AR = ar +AS = as diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index 8636b43e38e..0227c300041 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -1,16 +1,35 @@ # OpenBLAS specific Linux settings +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif +ifndef OPENBLASINC +$(error OPENBLASROOT not defined.) +endif ifndef OPENBLASLIBS $(error OPENBLASLIBS not defined.) endif -ifndef OPENBLASROOT -$(error OPENBLASROOT not defined.) +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -I$(OPENBLASINC) \ + -msse -msse2 -pthread -rdynamic \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC endif -CXXFLAGS += -msse -msse2 -pthread -rdynamic \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ - -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl -LDFLAGS += -rdynamic -LDLIBS += $(OPENBLASLIBS) +RANLIB = ranlib +AR = ar +AS = as diff --git a/src/makefiles/linux_openblas_arm.mk b/src/makefiles/linux_openblas_arm.mk index 682d62b5154..f1cdac8090d 100644 --- a/src/makefiles/linux_openblas_arm.mk +++ b/src/makefiles/linux_openblas_arm.mk @@ -1,16 +1,35 @@ # OpenBLAS specific Linux ARM settings +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif +ifndef OPENBLASINC +$(error OPENBLASINC not defined.) +endif ifndef OPENBLASLIBS $(error OPENBLASLIBS not defined.) endif -ifndef OPENBLASROOT -$(error OPENBLASROOT not defined.) +CXXFLAGS = -std=c++11 -I.. 
-I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -I$(OPENBLASINC) \ + -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC endif -CXXFLAGS += -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H \ - -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl -LDFLAGS += -rdynamic -LDLIBS += $(OPENBLASLIBS) +RANLIB = ranlib +AR = ar +AS = as diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index 5e93d393b3e..83b799356b9 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -9,14 +9,31 @@ # Use the options obtained from this website to manually configure for other # platforms using MKL. +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif ifndef MKLROOT $(error MKLROOT not defined.) endif MKLLIB ?= $(MKLROOT)/lib/em64t -CXXFLAGS += -m64 -msse -msse2 -pthread -rdynamic \ - -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_MKL -I$(MKLROOT)/include +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_MKL -I$(MKLROOT)/include \ + -m64 -msse -msse2 -pthread -rdynamic \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif ## Use the following for STATIC LINKING of the SEQUENTIAL version of MKL MKL_STA_SEQ = $(MKLLIB)/libmkl_solver_lp64_sequential.a -Wl,--start-group \ @@ -38,5 +55,9 @@ MKL_DYN_MUL = -L$(MKLLIB) -lmkl_solver_lp64 -Wl,--start-group -lmkl_intel_lp64 \ # MKLFLAGS = $(MKL_DYN_MUL) -LDFLAGS += -rdynamic -LDLIBS += $(MKLFLAGS) +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(MKLFLAGS) -lm -lpthread -ldl + +RANLIB = ranlib +AR = ar +AS = as From ac480ac18b23f821f634cd4d3d0883406b0a91b9 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 20 Dec 2016 23:19:33 -0800 Subject: [PATCH 094/213] Further changes to configure. --- src/Makefile | 15 ++------------- src/configure | 15 +++++++-------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/Makefile b/src/Makefile index cecc8ca5170..fded748fbe5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -31,7 +31,7 @@ include kaldi.mk # Reset the default goal, so that the all target will become default .DEFAULT_GOAL := -all: checkversion test_dependencies kaldi.mk mklibdir $(SUBDIRS) +all: checkversion kaldi.mk mklibdir $(SUBDIRS) -echo Done mklibdir: @@ -88,23 +88,12 @@ kaldi.mk: @[ -f kaldi.mk ] || { echo "kaldi.mk does not exist; you have to run ./configure"; exit 1; } # Compile optional stuff -ext: test_dependencies ext_depend $(SUBDIRS) $(EXT_SUBDIRS) +ext: ext_depend $(SUBDIRS) $(EXT_SUBDIRS) -echo Done -ifndef OPENFST_VER -$(error Please rerun configure: OPENFST_VER is not defined, likely kaldi.mk was produced by older configure script.) 
-endif -# Note: OPENFST_VER is determined by configure and added to kaldi.mk -OPENFST_VER_NUM := $(shell echo $(OPENFST_VER) | sed 's/\./ /g' | xargs printf "%d%02d%02d") -test_dependencies: -ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10503)","1") - $(error OpenFst $(OPENFST_VER) is not supported. You now need OpenFst >= 1.5.3.) -endif - check_portaudio: @[ -d ../tools/portaudio ] || ( cd ../tools; ./install_portaudio.sh ) - clean: rmlibdir -for x in $(SUBDIRS) $(EXT_SUBDIRS); do $(MAKE) -C $$x clean; done diff --git a/src/configure b/src/configure index b92729319a7..3e4db9a4712 100755 --- a/src/configure +++ b/src/configure @@ -908,9 +908,12 @@ if [ ! -f $FSTROOT/include/fst/fst.h ]; then failure "Could not find file $FSTROOT/include/fst/fst.h: you may not have installed OpenFst. See ../tools/INSTALL" fi -echo "FSTROOT = $FSTROOT" >> kaldi.mk -OPENFST_VER="${OPENFST_VER:-`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'`}" -echo "OPENFST_VER = $OPENFST_VER" >> kaldi.mk +OPENFST_VER=$(grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::') +OPENFST_VER_NUM=$(echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") +if [ $OPENFST_VER_NUM -lt 10503 ]; then + failure "OpenFst-$OPENFST_VER is not supported. You need OpenFst >= 1.5.3.)" +fi +echo "OPENFSTINC = $FSTROOT/include" >> kaldi.mk if $static_fst ; then OPENFSTLIBS="$FSTROOT/lib/libfst.a" else @@ -933,10 +936,6 @@ echo "OPENFSTLDFLAGS = $OPENFSTLDFLAGS" >> kaldi.mk echo "CXX = $CXX" >> kaldi.mk echo >> kaldi.mk -# Add platform independent settings -cat makefiles/common.mk >> kaldi.mk -echo >> kaldi.mk - # Most of the OS-specific steps below will append to kaldi.mk echo "Doing OS specific configurations ..." @@ -1118,8 +1117,8 @@ elif [ "`uname`" == "Linux" ]; then echo "Configuring dynamically loaded OpenBlas since --static-math=no (the default)" OPENBLASLIBS="-L$OPENBLASROOT/lib -lopenblas -lgfortran -Wl,-rpath=$OPENBLASROOT/lib" fi + echo "OPENBLASINC = $OPENBLASROOT/include" >> kaldi.mk echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk - echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_openblas_arm.mk >> kaldi.mk elif [[ "`uname -m`" == ppc64le ]]; then From b37e06b82cd530b6f23aeb06fec26e29fd3af20b Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 20 Dec 2016 23:27:36 -0800 Subject: [PATCH 095/213] Configure script now accepts binary flags without the yes/no qualifiers as yes. 
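For example, after this change the following invocations are intended to be
equivalent (the flag names are the ones handled in the option-parsing case
statement below; --shared is shown only as a typical accompanying option):

    # bare flag == explicit =yes
    ./configure --shared --use-cuda --static-math
    ./configure --shared --use-cuda=yes --static-math=yes

    # --threaded-math selects threaded ATLAS / iomp threading for MKL,
    # exactly like --threaded-math=yes
    ./configure --threaded-math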
--- src/configure | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/configure b/src/configure index 3e4db9a4712..1131b5f069e 100755 --- a/src/configure +++ b/src/configure @@ -219,6 +219,9 @@ do static_math=false; static_fst=false; shift ;; + --double-precision) + double_precision=true; + shift ;; --double-precision=yes) double_precision=true; shift ;; @@ -228,12 +231,19 @@ do --atlas-root=*) ATLASROOT=`read_dirname $1`; shift ;; + --threaded-atlas) + threaded_atlas=true; + shift ;; --threaded-atlas=yes) threaded_atlas=true; shift ;; --threaded-atlas=no) threaded_atlas=false; shift ;; + --threaded-math) + threaded_atlas=true; + mkl_threading=iomp + shift ;; --threaded-math=yes) threaded_atlas=true; mkl_threading=iomp @@ -242,18 +252,27 @@ do threaded_atlas=false; mkl_threading=sequential shift ;; + --use-cuda) + use_cuda=true; + shift ;; --use-cuda=yes) use_cuda=true; shift ;; --use-cuda=no) use_cuda=false; shift ;; + --static-math) + static_math=true; + shift ;; --static-math=yes) static_math=true; shift ;; --static-math=no) static_math=false; shift ;; + --static-fst) + static_fst=true; + shift ;; --static-fst=yes) static_fst=true; shift ;; From c38636752ec14726e0b4db75dfa48b8fd11f4c78 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 20 Dec 2016 23:45:48 -0800 Subject: [PATCH 096/213] Small cosmetic changes to platform specific makefiles. --- src/fstext/context-fst-inl.h | 11 ----------- src/makefiles/cuda_32bit.mk | 6 +++--- src/makefiles/cuda_64bit.mk | 6 +++--- src/makefiles/cygwin.mk | 4 ++-- src/makefiles/darwin.mk | 2 +- src/makefiles/linux_atlas.mk | 2 +- src/makefiles/linux_atlas_arm.mk | 2 +- src/makefiles/linux_clapack.mk | 2 +- src/makefiles/linux_clapack_arm.mk | 2 +- src/makefiles/linux_openblas.mk | 4 ++-- src/makefiles/linux_openblas_arm.mk | 2 +- src/makefiles/linux_x86_64_mkl.mk | 2 +- 12 files changed, 17 insertions(+), 28 deletions(-) diff --git a/src/fstext/context-fst-inl.h b/src/fstext/context-fst-inl.h index 4427863d887..6fa8632cf67 100644 --- a/src/fstext/context-fst-inl.h +++ b/src/fstext/context-fst-inl.h @@ -338,17 +338,6 @@ void ContextFstImpl::Expand(StateId s) { // expands arcs only [not this->SetArcs(s); // mark the arcs as "done". [so HasArcs returns true]. } - -// template -// ContextFst::ContextFst(const ContextFst &fst, bool reset) { -// if (reset) { -// impl_ = std::make_shared >(*(fst.impl_)); -// } else { -// impl_ = fst.impl_; -// } -// } - - template bool ContextMatcher::Find(typename Arc::Label match_label) { assert(s_ != kNoStateId); diff --git a/src/makefiles/cuda_32bit.mk b/src/makefiles/cuda_32bit.mk index 4019d5027b1..c6bba9669ea 100644 --- a/src/makefiles/cuda_32bit.mk +++ b/src/makefiles/cuda_32bit.mk @@ -1,9 +1,9 @@ -ifndef CUDATKDIR -$(error CUDATKDIR not defined.) -endif ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif +ifndef CUDATKDIR +$(error CUDATKDIR not defined.) +endif CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 32 -DHAVE_CUDA \ diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk index 0ce7bacdd00..89696253c84 100644 --- a/src/makefiles/cuda_64bit.mk +++ b/src/makefiles/cuda_64bit.mk @@ -1,9 +1,9 @@ -ifndef CUDATKDIR -$(error CUDATKDIR not defined.) -endif ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif +ifndef CUDATKDIR +$(error CUDATKDIR not defined.) 
+endif CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA \ diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index beaea294638..6cae548e3b2 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -13,7 +13,7 @@ endif CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -DHAVE_CLAPACK -I ../../tools/CLAPACK/ \ + -DHAVE_CLAPACK -I../../tools/CLAPACK/ \ -msse -msse2 \ -g # -O0 -DKALDI_PARANOID @@ -26,6 +26,6 @@ LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g \ LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -lcyglapack-0 -lcygblas-0 \ -lm -lpthread -ldl -RANLIB = ranlib AR = ar AS = as +RANLIB = ranlib diff --git a/src/makefiles/darwin.mk b/src/makefiles/darwin.mk index 2f1692018ac..e0570e43d55 100644 --- a/src/makefiles/darwin.mk +++ b/src/makefiles/darwin.mk @@ -24,9 +24,9 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -framework Accelerate -lm -lpthread -ldl -RANLIB = ranlib AR = ar AS = as +RANLIB = ranlib # Compiler specific flags COMPILER = $(shell $(CXX) -v 2>&1) diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index 3fbeab9bed3..8ae3b46c92e 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -30,6 +30,6 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -RANLIB = ranlib AR = ar AS = as +RANLIB = ranlib diff --git a/src/makefiles/linux_atlas_arm.mk b/src/makefiles/linux_atlas_arm.mk index 0bbcbdd2acd..c20ebd2373c 100644 --- a/src/makefiles/linux_atlas_arm.mk +++ b/src/makefiles/linux_atlas_arm.mk @@ -30,6 +30,6 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -RANLIB = ranlib AR = ar AS = as +RANLIB = ranlib diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index 60fbf4918e3..a597bd14935 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -24,6 +24,6 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -RANLIB = ranlib AR = ar AS = as +RANLIB = ranlib diff --git a/src/makefiles/linux_clapack_arm.mk b/src/makefiles/linux_clapack_arm.mk index ab49a3e6c13..5b60dc11e1a 100644 --- a/src/makefiles/linux_clapack_arm.mk +++ b/src/makefiles/linux_clapack_arm.mk @@ -24,6 +24,6 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -RANLIB = ranlib AR = ar AS = as +RANLIB = ranlib diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index 0227c300041..eaccd5d8646 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -10,7 +10,7 @@ ifndef OPENFSTLIBS $(error OPENFSTLIBS not defined.) endif ifndef OPENBLASINC -$(error OPENBLASROOT not defined.) +$(error OPENBLASINC not defined.) endif ifndef OPENBLASLIBS $(error OPENBLASLIBS not defined.) 
@@ -30,6 +30,6 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl -RANLIB = ranlib AR = ar AS = as +RANLIB = ranlib diff --git a/src/makefiles/linux_openblas_arm.mk b/src/makefiles/linux_openblas_arm.mk index f1cdac8090d..4e6e31aa715 100644 --- a/src/makefiles/linux_openblas_arm.mk +++ b/src/makefiles/linux_openblas_arm.mk @@ -30,6 +30,6 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl -RANLIB = ranlib AR = ar AS = as +RANLIB = ranlib diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index 83b799356b9..dbd7d72a523 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -58,6 +58,6 @@ MKL_DYN_MUL = -L$(MKLLIB) -lmkl_solver_lp64 -Wl,--start-group -lmkl_intel_lp64 \ LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(MKLFLAGS) -lm -lpthread -ldl -RANLIB = ranlib AR = ar AS = as +RANLIB = ranlib From c03893f52c9a03150d804b2a472ac1ff77bf9d17 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 20 Dec 2016 23:55:02 -0800 Subject: [PATCH 097/213] Update installation instructions. --- src/INSTALL | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/INSTALL b/src/INSTALL index 8decefe71c2..f40a514c4b6 100644 --- a/src/INSTALL +++ b/src/INSTALL @@ -23,8 +23,7 @@ e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. If your system default compiler does not support C++11, you can specify a C++11 compliant compiler by setting the CXX environment variable, e.g. - make depend CXX=g++-4.8 - make CXX=g++-4.8 + CXX=g++-4.8 ./configure --shared For more information, see documentation at http://kaldi-asr.org/doc/ and click on "The build process (how Kaldi is compiled)". From e72541513491b8d94abc774cf681a7360e57214f Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 21 Dec 2016 12:22:39 -0800 Subject: [PATCH 098/213] Cosmetic fixes. --- src/configure | 6 ++++-- src/makefiles/cuda_32bit.mk | 2 ++ src/makefiles/cuda_64bit.mk | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/configure b/src/configure index 1131b5f069e..f9ad1cd8c07 100755 --- a/src/configure +++ b/src/configure @@ -510,7 +510,6 @@ function configure_cuda { fi echo "Using CUDA toolkit $CUDATKDIR (nvcc compiler and runtime libraries)" echo >> kaldi.mk - echo "#Next section enables CUDA for compilation" >> kaldi.mk echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk @@ -966,7 +965,7 @@ if [ "`uname`" == "Darwin" ]; then if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then failure "Need the Accelerate framework to compile on Darwin." fi - OSX_VER=`sw_vers | grep ProductVersion | awk '{print $2}' | awk '{split($0,a,"."); print a[1] "." a[2]; }'` + OSX_VER=$(sw_vers | grep ProductVersion | awk '{print $2}' | awk '{split($0,a,"."); print a[1] "." a[2]; }') OSX_VER_NUM=$(echo $OSX_VER | sed 's/\./ /g' | xargs printf "%d%02d") echo "Configuring for OS X version $OSX_VER ..." if [ $OSX_VER_NUM -ge 1005 ]; then @@ -1018,6 +1017,7 @@ elif [ "`uname`" == "Linux" ]; then # containing {liblapack.a,libblas.a}, and linking against just these two # libraries worked. + echo >> kaldi.mk if $static_math; then # Prefer static to dynamic math. linux_configure_static || \ @@ -1088,6 +1088,7 @@ elif [ "`uname`" == "Linux" ]; then if [ ! 
-z $MKLLIBDIR ]; then echo MKLLIB = $MKLLIBDIR >> kaldi.mk fi + echo >> kaldi.mk check_exists makefiles/linux_x86_64_mkl.mk cat makefiles/linux_x86_64_mkl.mk >> kaldi.mk echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk @@ -1138,6 +1139,7 @@ elif [ "`uname`" == "Linux" ]; then fi echo "OPENBLASINC = $OPENBLASROOT/include" >> kaldi.mk echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk + echo >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_openblas_arm.mk >> kaldi.mk elif [[ "`uname -m`" == ppc64le ]]; then diff --git a/src/makefiles/cuda_32bit.mk b/src/makefiles/cuda_32bit.mk index c6bba9669ea..84b8686e374 100644 --- a/src/makefiles/cuda_32bit.mk +++ b/src/makefiles/cuda_32bit.mk @@ -1,3 +1,5 @@ +# 32bit CUDA settings + ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk index 89696253c84..c47908e7323 100644 --- a/src/makefiles/cuda_64bit.mk +++ b/src/makefiles/cuda_64bit.mk @@ -1,3 +1,5 @@ +# 64bit CUDA settings + ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif From f5b98f0ed20bf9b26a0b010c1749567ce2a0c9bf Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 21 Dec 2016 12:30:47 -0800 Subject: [PATCH 099/213] More cosmetic fixes. --- src/configure | 60 +++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/configure b/src/configure index f9ad1cd8c07..a9c11980812 100755 --- a/src/configure +++ b/src/configure @@ -588,30 +588,31 @@ function linux_configure_speex { } function linux_atlas_failure { - echo ATLASINC = $ATLASROOT/include >> kaldi.mk - echo ATLASLIBS = [somewhere]/liblapack.a [somewhere]/libcblas.a [somewhere]/libatlas.a [somewhere]/libf77blas.a $ATLASLIBDIR >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "`uname -m`" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk - else - cat makefiles/linux_atlas.mk >> kaldi.mk - fi - echo "** $* ***" - echo "** ERROR **" - echo "** Configure cannot proceed automatically." - echo "** If you know that you have ATLAS installed somewhere on your machine, you" - echo "** may be able to proceed by replacing [somewhere] in kaldi.mk with a directory." - echo "** If you have sudo (root) access you could install the ATLAS package on your" - echo "** machine, e.g. 'sudo apt-get install libatlas-dev libatlas-base-dev' or" - echo "** 'sudo yum install atlas.x86_64' or 'sudo zypper install libatlas3-devel'," - echo "** or on cygwin, install atlas from the installer GUI; and then run ./configure" - echo "** again." - echo "**" - echo "** Otherwise (or if you prefer OpenBLAS for speed), you could go the OpenBLAS" - echo "** route: cd to ../tools, type 'extras/install_openblas.sh', cd back to here," - echo "** and type './configure --openblas-root=../tools/OpenBLAS/install'" - exit 1; + echo ATLASINC = $ATLASROOT/include >> kaldi.mk + echo ATLASLIBS = [somewhere]/liblapack.a [somewhere]/libcblas.a [somewhere]/libatlas.a [somewhere]/libf77blas.a $ATLASLIBDIR >> kaldi.mk + echo >> kaldi.mk + if [[ "`uname -m`" == arm* ]]; then + cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk + else + cat makefiles/linux_atlas.mk >> kaldi.mk + fi + echo "** $* ***" + echo "** ERROR **" + echo "** Configure cannot proceed automatically." 
+ echo "** If you know that you have ATLAS installed somewhere on your machine, you" + echo "** may be able to proceed by replacing [somewhere] in kaldi.mk with a directory." + echo "** If you have sudo (root) access you could install the ATLAS package on your" + echo "** machine, e.g. 'sudo apt-get install libatlas-dev libatlas-base-dev' or" + echo "** 'sudo yum install atlas.x86_64' or 'sudo zypper install libatlas3-devel'," + echo "** or on cygwin, install atlas from the installer GUI; and then run ./configure" + echo "** again." + echo "**" + echo "** Otherwise (or if you prefer OpenBLAS for speed), you could go the OpenBLAS" + echo "** route: cd to ../tools, type 'extras/install_openblas.sh', cd back to here," + echo "** and type './configure --openblas-root=../tools/OpenBLAS/install'" + exit 1; } function linux_check_static { @@ -646,6 +647,7 @@ function linux_configure_debian_ubuntu { fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk + echo >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk elif [[ "`uname -m`" == ppc64le ]]; then @@ -669,6 +671,7 @@ function linux_configure_debian_ubuntu3 { fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk + echo >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk elif [[ "`uname -m`" == ppc64le ]]; then @@ -694,7 +697,7 @@ function linux_configure_debian7 { [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_debian7" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo + echo >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk elif [[ "`uname -m`" == ppc64le ]]; then @@ -717,7 +720,7 @@ function linux_configure_redhat { [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_redhat" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo + echo >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk elif [[ "`uname -m`" == ppc64le ]]; then @@ -742,7 +745,7 @@ function linux_configure_redhat_fat { [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_redhat_fat" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo + echo >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk elif [[ "`uname -m`" == ppc64le ]]; then @@ -799,6 +802,7 @@ function linux_configure_static { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk + echo >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk elif [[ "`uname -m`" == ppc64le ]]; then @@ -882,6 +886,7 @@ function linux_configure_dynamic { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk + echo >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk elif [[ "`uname -m`" == ppc64le ]]; then @@ -1017,7 +1022,6 @@ elif [ "`uname`" == "Linux" ]; then # containing {liblapack.a,libblas.a}, and linking against just these two # libraries worked. - echo >> kaldi.mk if $static_math; then # Prefer static to dynamic math. 
linux_configure_static || \ From 1a512aaa3ccf787f28563e8fab8b3e180cdb3117 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 21 Dec 2016 12:49:47 -0800 Subject: [PATCH 100/213] Further cosmetic fixes. --- src/configure | 9 ++++++--- src/makefiles/cuda_32bit.mk | 2 -- src/makefiles/cuda_64bit.mk | 2 -- src/makefiles/cygwin.mk | 2 +- src/makefiles/darwin.mk | 2 +- src/makefiles/linux_atlas.mk | 2 +- src/makefiles/linux_atlas_arm.mk | 2 +- src/makefiles/linux_clapack.mk | 2 +- src/makefiles/linux_clapack_arm.mk | 2 +- src/makefiles/linux_openblas.mk | 2 +- src/makefiles/linux_openblas_arm.mk | 2 +- src/makefiles/linux_x86_64_mkl.mk | 2 +- 12 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/configure b/src/configure index a9c11980812..07d9bb0a319 100755 --- a/src/configure +++ b/src/configure @@ -510,6 +510,8 @@ function configure_cuda { fi echo "Using CUDA toolkit $CUDATKDIR (nvcc compiler and runtime libraries)" echo >> kaldi.mk + echo "# CUDA configuration" >> kaldi.mk + echo >> kaldi.mk echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk @@ -528,6 +530,7 @@ function configure_cuda { *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; esac echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk + echo >> kaldi.mk # 64bit/32bit? if [ "`uname -m`" == "x86_64" ]; then @@ -903,7 +906,7 @@ echo "Configuring ..." # Back up the old kaldi.mk in case we modified it if [ -f kaldi.mk ]; then - echo "Backing up kaldi.mk to kaldi.mk.bak" + echo "Backing up kaldi.mk to kaldi.mk.bak ..." cp kaldi.mk kaldi.mk.bak fi @@ -914,7 +917,7 @@ printf "# This file was generated using the following command:\n# $cmd_line\n\n" echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk echo >> kaldi.mk -echo "# Configuration" >> kaldi.mk +echo "# Base configuration" >> kaldi.mk echo >> kaldi.mk if $dynamic_kaldi ; then KALDILIBDIR=`pwd`/lib @@ -1167,7 +1170,7 @@ fi # Append the flags set by environment variables last so they can be used # to override the automatically generated configuration. echo >> kaldi.mk -echo "# Environment settings" >> kaldi.mk +echo "# Environment configuration" >> kaldi.mk echo >> kaldi.mk if [ -n "$ENV_CXXFLAGS" ]; then echo "CXXFLAGS += $ENV_CXXFLAGS" >> kaldi.mk; fi if [ -n "$ENV_LDFLAGS" ]; then echo "LDFLAGS += $ENV_LDFLAGS" >> kaldi.mk; fi diff --git a/src/makefiles/cuda_32bit.mk b/src/makefiles/cuda_32bit.mk index 84b8686e374..c6bba9669ea 100644 --- a/src/makefiles/cuda_32bit.mk +++ b/src/makefiles/cuda_32bit.mk @@ -1,5 +1,3 @@ -# 32bit CUDA settings - ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk index c47908e7323..89696253c84 100644 --- a/src/makefiles/cuda_64bit.mk +++ b/src/makefiles/cuda_64bit.mk @@ -1,5 +1,3 @@ -# 64bit CUDA settings - ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) endif diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index 6cae548e3b2..14ece9d4ee7 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -1,4 +1,4 @@ -# Cygwin settings +# Cygwin configuration ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) 
diff --git a/src/makefiles/darwin.mk b/src/makefiles/darwin.mk index e0570e43d55..5dbcd6f768b 100644 --- a/src/makefiles/darwin.mk +++ b/src/makefiles/darwin.mk @@ -1,4 +1,4 @@ -# Darwin (macOS) settings +# Darwin (macOS) configuration ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index 8ae3b46c92e..9ab038295b6 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -1,4 +1,4 @@ -# ATLAS specific Linux settings +# ATLAS specific Linux configuration ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) diff --git a/src/makefiles/linux_atlas_arm.mk b/src/makefiles/linux_atlas_arm.mk index c20ebd2373c..0dfc32863b4 100644 --- a/src/makefiles/linux_atlas_arm.mk +++ b/src/makefiles/linux_atlas_arm.mk @@ -1,4 +1,4 @@ -# ATLAS specific Linux ARM settings +# ATLAS specific Linux ARM configuration ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index a597bd14935..d8f8cf5668f 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -1,4 +1,4 @@ -# CLAPACK specific Linux settings +# CLAPACK specific Linux configuration ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) diff --git a/src/makefiles/linux_clapack_arm.mk b/src/makefiles/linux_clapack_arm.mk index 5b60dc11e1a..432bd689f55 100644 --- a/src/makefiles/linux_clapack_arm.mk +++ b/src/makefiles/linux_clapack_arm.mk @@ -1,4 +1,4 @@ -# CLAPACK specific Linux ARM settings +# CLAPACK specific Linux ARM configuration ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index eaccd5d8646..a859fc7e272 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -1,4 +1,4 @@ -# OpenBLAS specific Linux settings +# OpenBLAS specific Linux configuration ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) diff --git a/src/makefiles/linux_openblas_arm.mk b/src/makefiles/linux_openblas_arm.mk index 4e6e31aa715..00c4ae2bbdd 100644 --- a/src/makefiles/linux_openblas_arm.mk +++ b/src/makefiles/linux_openblas_arm.mk @@ -1,4 +1,4 @@ -# OpenBLAS specific Linux ARM settings +# OpenBLAS specific Linux ARM configuration ifndef DOUBLE_PRECISION $(error DOUBLE_PRECISION not defined.) diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index dbd7d72a523..d2aee4a036f 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -1,4 +1,4 @@ -# MKL specific Linux settings +# MKL specific Linux configuration # We have tested Kaldi with MKL version 10.2 on Linux/GCC and Intel(R) 64 # architecture (also referred to as x86_64) with LP64 interface layer. From 6992617e60fef951f9ce55c5a196c9c1c9af5f2b Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 21 Dec 2016 14:16:55 -0800 Subject: [PATCH 101/213] Syncronize ppc64le configuration. 
--- src/configure | 2 +- src/makefiles/cuda_ppc64le.mk | 12 ------- src/makefiles/linux_atlas_ppc64le.mk | 37 +++++++++++---------- src/makefiles/linux_openblas_ppc64le.mk | 43 ++++++++++++------------- 4 files changed, 40 insertions(+), 54 deletions(-) delete mode 100644 src/makefiles/cuda_ppc64le.mk diff --git a/src/configure b/src/configure index 07d9bb0a319..8b68c97fd67 100755 --- a/src/configure +++ b/src/configure @@ -540,7 +540,7 @@ function configure_cuda { cat makefiles/cuda_64bit.mk >> kaldi.mk fi elif [ "`uname -m`" == "ppc64le" ]; then - cat makefiles/cuda_ppc64le.mk >> kaldi.mk + cat makefiles/cuda_64bit.mk >> kaldi.mk else cat makefiles/cuda_32bit.mk >> kaldi.mk fi diff --git a/src/makefiles/cuda_ppc64le.mk b/src/makefiles/cuda_ppc64le.mk deleted file mode 100644 index 3941de6a230..00000000000 --- a/src/makefiles/cuda_ppc64le.mk +++ /dev/null @@ -1,12 +0,0 @@ - -ifndef DOUBLE_PRECISION -$(error DOUBLE_PRECISION not defined.) -endif - - -CUDA_INCLUDE= -I$(CUDATKDIR)/include -CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -CUDA_LDLIBS += -lcublas -lcudart -lcurand #LDLIBS : The libs are loaded later than static libs in implicit rule diff --git a/src/makefiles/linux_atlas_ppc64le.mk b/src/makefiles/linux_atlas_ppc64le.mk index 234a3794721..aa121fc5cdc 100644 --- a/src/makefiles/linux_atlas_ppc64le.mk +++ b/src/makefiles/linux_atlas_ppc64le.mk @@ -1,37 +1,36 @@ -# You have to make sure ATLASLIBS is set... +# ATLAS specific Linux ppc64le configuration -ifndef FSTROOT -$(error FSTROOT not defined.) +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) endif - ifndef ATLASINC $(error ATLASINC not defined.) endif - ifndef ATLASLIBS $(error ATLASLIBS not defined.) endif - -DOUBLE_PRECISION = 0 -CXXFLAGS = -m64 -maltivec -mcpu=power8 -Wall -I.. \ - -mtune=power8 -mpower8-vector -mvsx -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_ATLAS -I$(ATLASINC) \ - -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) \ + -m64 -maltivec -mcpu=power8 -mtune=power8 -mpower8-vector -mvsx \ + -pthread -rdynamic \ + -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -CC = g++ -CXX = g++ + AR = ar AS = as RANLIB = ranlib diff --git a/src/makefiles/linux_openblas_ppc64le.mk b/src/makefiles/linux_openblas_ppc64le.mk index 222551f3bab..1e7a391dc79 100644 --- a/src/makefiles/linux_openblas_ppc64le.mk +++ b/src/makefiles/linux_openblas_ppc64le.mk @@ -1,37 +1,36 @@ -# You have to make sure FSTROOT,OPENBLASROOT,OPENBLASLIBS are set... +# OpenBLAS specific Linux configuration -ifndef FSTROOT -$(error FSTROOT not defined.) +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) 
+endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif +ifndef OPENBLASINC +$(error OPENBLASINC not defined.) endif - ifndef OPENBLASLIBS $(error OPENBLASLIBS not defined.) endif -ifndef OPENBLASROOT -$(error OPENBLASROOT not defined.) -endif - - -DOUBLE_PRECISION = 0 -CXXFLAGS = -m64 -maltivec -mcpu=power8 -Wall -I.. \ - -mtune=power8 -mpower8-vector -mvsx -pthread \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ - -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include \ - -I $(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -I$(OPENBLASINC) \ + -m64 -maltivec -mcpu=power8 -mtune=power8 -mpower8-vector -mvsx \ + -pthread -rdynamic \ + -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl -CC = g++ -CXX = g++ + AR = ar AS = as RANLIB = ranlib From fed2288644bcfd42421ad3720fb4eb1222bda0e5 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 21 Dec 2016 14:17:18 -0800 Subject: [PATCH 102/213] Update .gitignore. --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 16d03d4a193..05f3cc39fe8 100644 --- a/.gitignore +++ b/.gitignore @@ -88,6 +88,8 @@ GSYMS /tools/openfst-1.3.4/ /tools/openfst-1.4.1.tar.gz /tools/openfst-1.4.1/ +/tools/openfst-1.5.4.tar.gz +/tools/openfst-1.5.4/ /tools/pa_stable_v19_20111121.tgz /tools/portaudio/ /tools/sctk-2.4.0-20091110-0958.tar.bz2 From f31024ebf9f39c01cb0332417547b4385d819b4c Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 22 Dec 2016 21:24:55 -0800 Subject: [PATCH 103/213] Remove a few include guards that are no longer needed. 
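The guards being dropped selected between the C++11 <unordered_map> /
<unordered_set> headers and their tr1 fallbacks depending on the compiler;
since the build now assumes a C++11 compiler throughout (see src/INSTALL),
the standard headers can be included unconditionally. A quick sanity check,
purely illustrative and not part of the patch (the temporary file name is
arbitrary):

    cat > /tmp/check_cxx11.cc <<'EOF'
    #include <unordered_map>
    int main() { std::unordered_map<int, int> m; m[1] = 2; return 0; }
    EOF
    # should compile cleanly with any compiler Kaldi supports (g++ >= 4.7, clang >= 3.3)
    g++ -std=c++11 /tmp/check_cxx11.cc -o /tmp/check_cxx11 && echo "C++11 OK"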
--- src/fstext/determinize-star-inl.h | 8 -------- src/util/stl-utils.h | 13 ------------- tools/Makefile | 6 +----- 3 files changed, 1 insertion(+), 26 deletions(-) diff --git a/src/fstext/determinize-star-inl.h b/src/fstext/determinize-star-inl.h index ea599008e56..b9eaa485350 100644 --- a/src/fstext/determinize-star-inl.h +++ b/src/fstext/determinize-star-inl.h @@ -24,16 +24,8 @@ #include "base/kaldi-error.h" -#ifdef _MSC_VER #include using std::unordered_map; -#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) -#include -using std::unordered_map; -#else -#include -using std::tr1::unordered_map; -#endif #include #include diff --git a/src/util/stl-utils.h b/src/util/stl-utils.h index b5f8f246d95..95ca0b03c5a 100644 --- a/src/util/stl-utils.h +++ b/src/util/stl-utils.h @@ -20,22 +20,10 @@ #ifndef KALDI_UTIL_STL_UTILS_H_ #define KALDI_UTIL_STL_UTILS_H_ -#ifdef _MSC_VER #include #include using std::unordered_map; using std::unordered_set; -#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) -#include -#include -using std::unordered_map; -using std::unordered_set; -#else -#include -#include -using std::tr1::unordered_map; -using std::tr1::unordered_set; -#endif #include #include @@ -329,4 +317,3 @@ inline void MergePairVectorSumming(std::vector > *vec) { } // namespace kaldi #endif // KALDI_UTIL_STL_UTILS_H_ - diff --git a/tools/Makefile b/tools/Makefile index 772f8c18398..8ca95ac95ff 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -72,10 +72,6 @@ ifeq ($(OSTYPE),cygwin) else ifeq ($(OS),Windows_NT) cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS) -O -Wa,-mbig-obj" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" else - # ppc64le needs the newsted config.guess to be correctly indentified - [ "$(shell uname -p)" = "ppc64le" ] && wget -O openfst-$(OPENFST_VERSION)/config.guess \ - "http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD" || \ - echo "config.guess unchanged" cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" endif @@ -83,7 +79,7 @@ openfst-$(OPENFST_VERSION): openfst-$(OPENFST_VERSION).tar.gz tar xozf openfst-$(OPENFST_VERSION).tar.gz openfst-$(OPENFST_VERSION).tar.gz: - wget --tries=1 -T 5 http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ + wget -T 10 -t 1 http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ wget -T 10 -t 3 http://www.openslr.org/resources/2/openfst-$(OPENFST_VERSION).tar.gz sclite: sclite_compiled From d037234ff292382ea00f241caa3caf9f115f1523 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Sun, 8 Jan 2017 20:07:11 -0800 Subject: [PATCH 104/213] Upgrade codebase to support OpenFst-1.6.0. 
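Besides bumping the version in tools/Makefile, this tracks OpenFst's move of
implementation-level entities into the fst::internal namespace (e.g.
fst::internal::AcceptorMinimize, and Kaldi's own ContextFstImpl is now wrapped
in namespace internal), and configure now rejects anything older than 1.6.0.
The version gate compares an integer built from the dotted version string;
for example, using the same pipeline as configure and tools/Makefile (shown
here only as a worked illustration):

    echo 1.6.0 | sed 's/\./ /g' | xargs printf "%d%02d%02d"; echo  # 10600 -> accepted
    echo 1.5.4 | sed 's/\./ /g' | xargs printf "%d%02d%02d"; echo  # 10504 -> rejected (< 10600)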
--- .gitignore | 2 + src/base/kaldi-utils.h | 4 +- src/chain/chain-den-graph.cc | 2 +- src/configure | 4 +- src/fstext/context-fst-inl.h | 3 + src/fstext/context-fst.h | 8 +- src/fstext/determinize-lattice-inl.h | 4 +- src/fstext/determinize-star-inl.h | 4 +- src/fstext/fstext-utils.h | 2 +- src/fstext/lattice-weight.h | 6 ++ src/fstext/trivial-factor-weight.h | 7 +- src/gmmbin/gmm-adapt-map.cc | 32 +++---- src/hmm/transition-model.h | 2 +- src/lat/determinize-lattice-pruned.cc | 116 +++++++++++++------------- src/nnet3bin/nnet3-average.cc | 15 ++-- tools/Makefile | 2 +- 16 files changed, 113 insertions(+), 100 deletions(-) diff --git a/.gitignore b/.gitignore index 05f3cc39fe8..e6d9c0fd612 100644 --- a/.gitignore +++ b/.gitignore @@ -90,6 +90,8 @@ GSYMS /tools/openfst-1.4.1/ /tools/openfst-1.5.4.tar.gz /tools/openfst-1.5.4/ +/tools/openfst-1.6.0.tar.gz +/tools/openfst-1.6.0/ /tools/pa_stable_v19_20111121.tgz /tools/portaudio/ /tools/sctk-2.4.0-20091110-0958.tar.bz2 diff --git a/src/base/kaldi-utils.h b/src/base/kaldi-utils.h index 47c60b4b01d..2cfecdcc7db 100644 --- a/src/base/kaldi-utils.h +++ b/src/base/kaldi-utils.h @@ -113,8 +113,7 @@ void Sleep(float seconds); (reinterpret_cast(&a))[1]=t;} -// Makes copy constructor and operator= private. Same as in compat.h of OpenFst -// toolkit. +// Makes copy constructor and operator= private. #define KALDI_DISALLOW_COPY_AND_ASSIGN(type) \ type(const type&); \ void operator = (const type&) @@ -156,4 +155,3 @@ template<> class KaldiCompileTimeAssert { #endif #endif // KALDI_BASE_KALDI_UTILS_H_ - diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index 6f494a0c562..5386f959b1f 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -186,7 +186,7 @@ void MinimizeAcceptorNoPush(fst::StdVectorFst *fst) { fst::EncodeMapper encoder(fst::kEncodeLabels | fst::kEncodeWeights, fst::ENCODE); fst::Encode(fst, &encoder); - fst::AcceptorMinimize(fst); + fst::internal::AcceptorMinimize(fst); fst::Decode(fst, encoder); } diff --git a/src/configure b/src/configure index 8b68c97fd67..3388d8ebd50 100755 --- a/src/configure +++ b/src/configure @@ -936,8 +936,8 @@ if [ ! -f $FSTROOT/include/fst/fst.h ]; then fi OPENFST_VER=$(grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::') OPENFST_VER_NUM=$(echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") -if [ $OPENFST_VER_NUM -lt 10503 ]; then - failure "OpenFst-$OPENFST_VER is not supported. You need OpenFst >= 1.5.3.)" +if [ $OPENFST_VER_NUM -lt 10600 ]; then + failure "OpenFst-$OPENFST_VER is not supported. You need OpenFst >= 1.6.0.)" fi echo "OPENFSTINC = $FSTROOT/include" >> kaldi.mk if $static_fst ; then diff --git a/src/fstext/context-fst-inl.h b/src/fstext/context-fst-inl.h index 6fa8632cf67..dc8a4a8370b 100644 --- a/src/fstext/context-fst-inl.h +++ b/src/fstext/context-fst-inl.h @@ -31,6 +31,7 @@ namespace fst { /// \addtogroup context_fst_group /// @{ +namespace internal { template typename ContextFstImpl::StateId @@ -338,6 +339,8 @@ void ContextFstImpl::Expand(StateId s) { // expands arcs only [not this->SetArcs(s); // mark the arcs as "done". [so HasArcs returns true]. 
} +} // namespace internal + template bool ContextMatcher::Find(typename Arc::Label match_label) { assert(s_ != kNoStateId); diff --git a/src/fstext/context-fst.h b/src/fstext/context-fst.h index 7a00b7ed2f1..246dce924b2 100644 --- a/src/fstext/context-fst.h +++ b/src/fstext/context-fst.h @@ -64,11 +64,12 @@ namespace fst { /// \addtogroup context_fst_group "Classes and functions related to context expansion" /// @{ +namespace internal { + /* ContextFstImpl inherits from CacheImpl, which handles caching of states. */ - template // make the vector &fst) : ImplToFst(std::make_shared(fst, TrivialFactorWeightOptions())) {} diff --git a/src/gmmbin/gmm-adapt-map.cc b/src/gmmbin/gmm-adapt-map.cc index bc0bac9f6cc..ec3eb8cea9b 100644 --- a/src/gmmbin/gmm-adapt-map.cc +++ b/src/gmmbin/gmm-adapt-map.cc @@ -40,20 +40,20 @@ int main(int argc, char *argv[]) { "\n" "Usage: gmm-adapt-map [options] " " \n"; - + ParseOptions po(usage); - string spk2utt_rspecifier; + std::string spk2utt_rspecifier; bool binary = true; MapDiagGmmOptions map_config; std::string update_flags_str = "mw"; - + po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to " "utterance-list map"); po.Register("binary", &binary, "Write output in binary mode"); po.Register("update-flags", &update_flags_str, "Which GMM parameters will be " "updated: subset of mvw."); map_config.Register(&po); - + po.Read(argc, argv); if (po.NumArgs() != 4) { @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) { map_am_wspecifier = po.GetArg(4); GmmFlagsType update_flags = StringToGmmFlags(update_flags_str); - + RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier); MapAmDiagGmmWriter map_am_writer(map_am_wspecifier); @@ -83,7 +83,7 @@ int main(int argc, char *argv[]) { double tot_like = 0.0, tot_like_change = 0.0, tot_t = 0.0, tot_t_check = 0.0; int32 num_done = 0, num_err = 0; - + if (spk2utt_rspecifier != "") { // per-speaker adaptation SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); @@ -93,9 +93,9 @@ int main(int argc, char *argv[]) { copy_am_gmm.CopyFromAmDiagGmm(am_gmm); AccumAmDiagGmm map_accs; map_accs.Init(am_gmm, update_flags); - + const std::vector &uttlist = spk2utt_reader.Value(); - + // for each speaker, estimate MAP means std::vector::const_iterator iter = uttlist.begin(), end = uttlist.end(); @@ -124,8 +124,8 @@ int main(int argc, char *argv[]) { ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior); for ( size_t i = 0; i < posterior.size(); i++ ) { for ( size_t j = 0; j < pdf_posterior[i].size(); j++ ) { - int32 pdf_id = pdf_posterior[i][j].first; - BaseFloat weight = pdf_posterior[i][j].second; + int32 pdf_id = pdf_posterior[i][j].first; + BaseFloat weight = pdf_posterior[i][j].second; file_like += map_accs.AccumulateForGmm(copy_am_gmm, feats.Row(i), pdf_id, weight); @@ -135,7 +135,7 @@ int main(int argc, char *argv[]) { KALDI_VLOG(2) << "Average like for utterance " << utt << " is " << (file_like/file_t) << " over " << file_t << " frames."; - + tot_like += file_like; tot_t += file_t; num_done++; @@ -144,7 +144,7 @@ int main(int argc, char *argv[]) { KALDI_VLOG(1) << "Avg like per frame so far is " << (tot_like / tot_t); } // end looping over all utterances of the current speaker - + // MAP estimation. 
BaseFloat spk_objf_change = 0.0, spk_frames = 0.0; MapAmDiagGmmUpdate(map_config, map_accs, update_flags, ©_am_gmm, @@ -154,7 +154,7 @@ int main(int argc, char *argv[]) { << " over " << spk_frames << " frames."; tot_like_change += spk_objf_change; tot_t_check += spk_frames; - + // Writing AM for each speaker in a table map_am_writer.Write(spk,copy_am_gmm); } // end looping over speakers @@ -201,9 +201,9 @@ int main(int argc, char *argv[]) { tot_like += file_like; tot_t += file_t; if ( num_done % 10 == 0 ) - KALDI_VLOG(1) << "Avg like per frame so far is " + KALDI_VLOG(1) << "Avg like per frame so far is " << (tot_like / tot_t); - + // MAP BaseFloat utt_objf_change = 0.0, utt_frames = 0.0; MapAmDiagGmmUpdate(map_config, map_accs, update_flags, ©_am_gmm, @@ -213,7 +213,7 @@ int main(int argc, char *argv[]) { << " over " << utt_frames << " frames."; tot_like_change += utt_objf_change; tot_t_check += utt_frames; - + // Writing AM for each utterance in a table map_am_writer.Write(feature_reader.Key(), copy_am_gmm); } diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index 33a0d55443e..442de8fd2e0 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -317,7 +317,7 @@ class TransitionModel { int32 num_pdfs_; - DISALLOW_COPY_AND_ASSIGN(TransitionModel); + KALDI_DISALLOW_COPY_AND_ASSIGN(TransitionModel); }; diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc index e38c62b3bfa..8c790e749a3 100644 --- a/src/lat/determinize-lattice-pruned.cc +++ b/src/lat/determinize-lattice-pruned.cc @@ -48,8 +48,8 @@ template class LatticeDeterminizerPruned { typedef CompactLatticeWeightTpl CompactWeight; typedef ArcTpl CompactArc; // arc in compact, acceptor form of lattice - typedef ArcTpl Arc; // arc in non-compact version of lattice - + typedef ArcTpl Arc; // arc in non-compact version of lattice + // Output to standard FST with CompactWeightTpl as its weight type (the // weight stores the original output-symbol strings). If destroy == true, // release memory as we go (but we cannot output again). @@ -123,7 +123,7 @@ template class LatticeDeterminizerPruned { for (OutputStateId this_state_id = 0; this_state_id < nStates; this_state_id++) { OutputState &this_state = *(output_states_[this_state_id]); vector &this_vec(this_state.arcs); - + typename vector::const_iterator iter = this_vec.begin(), end = this_vec.end(); for (; iter != end; ++iter) { const TempArc &temp_arc(*iter); @@ -209,12 +209,12 @@ template class LatticeDeterminizerPruned { ifst_ = NULL; } { MinimalSubsetHash tmp; tmp.swap(minimal_hash_); } - + for (size_t i = 0; i < output_states_.size(); i++) { vector empty_subset; empty_subset.swap(output_states_[i]->minimal_subset); } - + for (typename InitialSubsetHash::iterator iter = initial_hash_.begin(); iter != initial_hash_.end(); ++iter) delete iter->first; @@ -235,14 +235,14 @@ template class LatticeDeterminizerPruned { } { vector > tmp; tmp.swap(all_elems_tmp_); } } - + ~LatticeDeterminizerPruned() { FreeMostMemory(); FreeOutputStates(); // rest is deleted by destructors. } - - void RebuildRepository() { // rebuild the string repository, + + void RebuildRepository() { // rebuild the string repository, // freeing stuff we don't need.. we call this when memory usage // passes a supplied threshold. We need to accumulate all the // strings we need the repository to "remember", then tell it @@ -281,10 +281,10 @@ template class LatticeDeterminizerPruned { needed_strings.end()), needed_strings.end()); // uniq the strings. 
KALDI_LOG << "Rebuilding repository."; - + repository_.Rebuild(needed_strings); } - + bool CheckMemoryUsage() { int32 repo_size = repository_.MemSize(), arcs_size = num_arcs_ * sizeof(TempArc), @@ -299,7 +299,7 @@ template class LatticeDeterminizerPruned { KALDI_VLOG(2) << "Rebuilt repository in determinize-lattice: repository shrank from " << repo_size << " to " << new_repo_size << " bytes (approximately)"; - + if (new_total_size > static_cast(opts_.max_mem * 0.8)) { // Rebuilding didn't help enough-- we need a margin to stop // having to rebuild too often. We'll just return to the user at @@ -325,7 +325,7 @@ template class LatticeDeterminizerPruned { } return true; } - + bool Determinize(double *effective_beam) { KALDI_ASSERT(!determinized_); // This determinizes the input fst but leaves it in the "special format" @@ -344,13 +344,13 @@ template class LatticeDeterminizerPruned { // memory passed a user-specified threshold and cleanup failed // to get it below that threshold. size_t num_states = output_states_.size(); - if ((opts_.max_states > 0 && num_states > opts_.max_states) || - (opts_.max_arcs > 0 && num_arcs_ > opts_.max_arcs) || + if ((opts_.max_states > 0 && num_states > opts_.max_states) || + (opts_.max_arcs > 0 && num_arcs_ > opts_.max_arcs) || (num_states % 10 == 0 && !CheckMemoryUsage())) { // note: at some point // it was num_states % 100, not num_states % 10, but I encountered an example // where memory was exhausted before we reached state #100. KALDI_VLOG(1) << "Lattice determinization terminated but not " - << " because of lattice-beam. (#states, #arcs) is ( " + << " because of lattice-beam. (#states, #arcs) is ( " << output_states_.size() << ", " << num_arcs_ << " ), versus limits ( " << opts_.max_states << ", " << opts_.max_arcs << " ) (else, may be memory limit)."; @@ -376,7 +376,7 @@ template class LatticeDeterminizerPruned { // arc or state limit. } private: - + typedef typename Arc::Label Label; typedef typename Arc::StateId StateId; // use this when we don't know if it's input or output. typedef typename Arc::StateId InputStateId; // state in the input FST. @@ -493,7 +493,7 @@ template class LatticeDeterminizerPruned { // these types are the same anyway]. typedef unordered_map*, Element, SubsetKey, SubsetEqual> InitialSubsetHash; - + // converts the representation of the subset from canonical (all states) to // minimal (only states with output symbols on arcs leaving them, and final @@ -511,7 +511,7 @@ template class LatticeDeterminizerPruned { } subset->resize(cur_out - subset->begin()); } - + // Takes a minimal, normalized subset, and converts it to an OutputStateId. // Involves a hash lookup, and possibly adding a new OutputStateId. // If it creates a new OutputStateId, it creates a new record for it, works @@ -546,7 +546,7 @@ template class LatticeDeterminizerPruned { return state_id; } - + // Given a normalized initial subset of elements (i.e. before epsilon closure), // compute the corresponding output-state. OutputStateId InitialToStateId(const vector &subset_in, @@ -573,7 +573,7 @@ template class LatticeDeterminizerPruned { ConvertToMinimal(&subset); // remove all but emitting and final states. Element elem; // will be used to store remaining weight and string, and - // OutputStateId, in initial_hash_; + // OutputStateId, in initial_hash_; NormalizeSubset(&subset, &elem.weight, &elem.string); // normalize subset; put // common string and weight in "elem". The subset is now a minimal, // normalized subset. 
@@ -584,7 +584,7 @@ template class LatticeDeterminizerPruned { *common_prefix = elem.string; if (elem.weight == Weight::Zero()) KALDI_WARN << "Zero weight!"; - + // Before returning "ans", add the initial subset to the hash, // so that we can bypass the epsilon-closure etc., next time // we process the same initial subset. @@ -634,7 +634,7 @@ template class LatticeDeterminizerPruned { // at input, subset must have only one example of each StateId. [will still // be so at output]. This function follows input-epsilons, and augments the // subset accordingly. - + std::priority_queue, greater > queue; unordered_map cur_subset; typedef typename unordered_map::iterator MapIter; @@ -653,7 +653,7 @@ template class LatticeDeterminizerPruned { while (queue.size() != 0) { Element elem = queue.top(); queue.pop(); - + // The next if-statement is a kind of optimization. It's to prevent us // unnecessarily repeating the processing of a state. "cur_subset" always // contains only one Element with a particular state. The issue is that @@ -678,8 +678,8 @@ template class LatticeDeterminizerPruned { next_elem.state = arc.nextstate; next_elem.weight = Times(elem.weight, arc.weight); // next_elem.string is not set up yet... create it only - // when we know we need it (this is an optimization) - + // when we know we need it (this is an optimization) + MapIter iter = cur_subset.find(next_elem.state); if (iter == cur_subset.end()) { // was no such StateId: insert and add to queue. @@ -695,10 +695,10 @@ template class LatticeDeterminizerPruned { if (comp == 0) { // A tie on weights. This should be a rare case; // we don't optimize for it. next_elem.string = (arc.olabel == 0 ? elem.string : - repository_.Successor(elem.string, + repository_.Successor(elem.string, arc.olabel)); comp = Compare(next_elem.weight, next_elem.string, - iter->second.weight, iter->second.string); + iter->second.weight, iter->second.string); } if(comp == 1) { // next_elem is better, so use its (weight, string) next_elem.string = (arc.olabel == 0 ? elem.string : @@ -766,7 +766,7 @@ template class LatticeDeterminizerPruned { temp_arc.string = final_string; temp_arc.weight = final_weight; state.arcs.push_back(temp_arc); - num_arcs_++; + num_arcs_++; } } @@ -808,11 +808,11 @@ template class LatticeDeterminizerPruned { // (weight, string) pair in the semiring). void MakeSubsetUnique(vector *subset) { typedef typename vector::iterator IterType; - + // This KALDI_ASSERT is designed to fail (usually) if the subset is not sorted on // state. KALDI_ASSERT(subset->size() < 2 || (*subset)[0].state <= (*subset)[1].state); - + IterType cur_in = subset->begin(), cur_out = cur_in, end = subset->end(); size_t num_out = 0; // Merge elements with same state-id @@ -835,7 +835,7 @@ template class LatticeDeterminizerPruned { } subset->resize(num_out); } - + // ProcessTransition was called from "ProcessTransitions" in the non-pruned // code, but now we in effect put the calls to ProcessTransition on a priority // queue, and it now gets called directly from Determinize(). This function @@ -850,7 +850,7 @@ template class LatticeDeterminizerPruned { Weight tot_weight; NormalizeSubset(subset, &tot_weight, &common_str); forward_cost += ConvertToCost(tot_weight); - + OutputStateId nextstate; { Weight next_tot_weight; @@ -876,7 +876,7 @@ template class LatticeDeterminizerPruned { // "less than" operator for pair. Used in ProcessTransitions. 
- // Lexicographical order, which only compares the state when ordering the + // Lexicographical order, which only compares the state when ordering the // "Element" member of the pair. class PairComparator { @@ -898,7 +898,7 @@ template class LatticeDeterminizerPruned { // states. Partitions the emitting transitions up by ilabel (by sorting on // ilabel), and for each unique ilabel, it creates a Task record that contains // the information we need to process the transition. - + void ProcessTransitions(OutputStateId output_state_id) { const vector &minimal_subset = output_states_[output_state_id]->minimal_subset; // it's possible that minimal_subset could be empty if there are @@ -922,7 +922,7 @@ template class LatticeDeterminizerPruned { next_elem.weight = Times(elem.weight, arc.weight); if (arc.olabel == 0) // output epsilon next_elem.string = elem.string; - else + else next_elem.string = repository_.Successor(elem.string, arc.olabel); all_elems.push_back(this_pr); } @@ -953,7 +953,7 @@ template class LatticeDeterminizerPruned { backward_costs_[element.state]); cur++; } - + // After the command below, the "priority_cost" is a value comparable to // the total-weight of the input FST, like a total-path weight... of // course, it will typically be less (in the semiring) than that. @@ -965,7 +965,7 @@ template class LatticeDeterminizerPruned { delete task; } else { MakeSubsetUnique(&(task->subset)); // remove duplicate Elements with the same state. - queue_.push(task); // Push the task onto the queue. The queue keeps it + queue_.push(task); // Push the task onto the queue. The queue keeps it // in prioritized order, so we always process the one with the "best" // weight (highest in the semiring). @@ -983,7 +983,7 @@ template class LatticeDeterminizerPruned { // empty. } - + bool IsIsymbolOrFinal(InputStateId state) { // returns true if this state // of the input FST either is final or has an osymbol on an arc out of it. // Uses the vector isymbol_or_final_ as a cache for this info. @@ -1029,13 +1029,13 @@ template class LatticeDeterminizerPruned { if (ifst_->Start() == kNoStateId) return; // we'll be returning // an empty FST. - + double best_cost = backward_costs_[ifst_->Start()]; if (best_cost == numeric_limits::infinity()) KALDI_WARN << "Total weight of input lattice is zero."; cutoff_ = best_cost + beam_; } - + void InitializeDeterminization() { // We insist that the input lattice be topologically sorted. This is not a // fundamental limitation of the algorithm (which in principle should be @@ -1088,8 +1088,8 @@ template class LatticeDeterminizerPruned { // the queue, which we'll start processing in Determinize(). } } - - DISALLOW_COPY_AND_ASSIGN(LatticeDeterminizerPruned); + + KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeDeterminizerPruned); struct OutputState { vector minimal_subset; @@ -1106,23 +1106,23 @@ template class LatticeDeterminizerPruned { double forward_cost): minimal_subset(minimal_subset), forward_cost(forward_cost) { } }; - + vector output_states_; // All the info about the output states. - + int num_arcs_; // keep track of memory usage: number of arcs in output_states_[ ]->arcs int num_elems_; // keep track of memory usage: number of elems in output_states_ and // the keys of initial_hash_ - + const ExpandedFst *ifst_; std::vector backward_costs_; // This vector stores, for every state in ifst_, // the minimal cost to the end-state (i.e. the sum of weights; they are guaranteed to // have "take-the-minimum" semantics). 
We get the double from the ConvertToCost() // function on the lattice weights. - + double beam_; double cutoff_; // beam plus total-weight of input (and note, the weight is // guaranteed to be "tropical-like" so the sum does represent a min-cost. - + DeterminizeLatticePrunedOptions opts_; SubsetKey hasher_; // object that computes keys-- has no data members. SubsetEqual equal_; // object that compares subsets-- only data member is delta_. @@ -1141,7 +1141,7 @@ template class LatticeDeterminizerPruned { // normalize, there may be an extra weight // and string. Owns the pointers // in its keys. - + struct Task { OutputStateId state; // State from which we're processing the transition. Label label; // Label on the transition we're processing out of this state. @@ -1164,15 +1164,15 @@ template class LatticeDeterminizerPruned { // order according to the best weight of any path passing through these // determinized states... it's possible to work this out. std::priority_queue, TaskCompare> queue_; - + vector > all_elems_tmp_; // temporary vector used in ProcessTransitions. - + enum IsymbolOrFinal { OSF_UNKNOWN = 0, OSF_NO = 1, OSF_YES = 2 }; - + vector isymbol_or_final_; // A kind of cache; it says whether // each state is (emitting or final) where emitting means it has at least one // non-epsilon output arc. Only accessed by IsIsymbolOrFinal() - + LatticeStringRepository repository_; // defines a compact and fast way of // storing sequences of labels. @@ -1300,7 +1300,7 @@ typename ArcTpl::Label DeterminizeLatticeInsertPhones( // Work out the first phone symbol. This is more related to the phone // insertion function, so we put it here and make it the returning value of - // DeterminizeLatticeInsertPhones(). + // DeterminizeLatticeInsertPhones(). Label first_phone_label = HighestNumberedInputSymbol(*fst) + 1; // Insert phones here. @@ -1373,7 +1373,7 @@ void DeterminizeLatticeDeletePhones( template void DeterminizeLatticeDeletePhones( ArcTpl::Label first_phone_label, - MutableFst > *fst); + MutableFst > *fst); /** This function does a first pass determinization with phone symbols inserted at phone boundary. It uses a transition model to work out the transition-id @@ -1396,7 +1396,7 @@ bool DeterminizeLatticePhonePrunedFirstPass( typename ArcTpl::Label first_phone_label = DeterminizeLatticeInsertPhones(trans_model, fst); TopSort(fst); - + // Second, do determinization with phone inserted. bool ans = DeterminizeLatticePruned(*fst, beam, fst, opts); @@ -1438,7 +1438,7 @@ bool DeterminizeLatticePhonePruned( // lattices. 
if (opts.phone_determinize) { KALDI_VLOG(1) << "Doing first pass of determinization on phone + word " - << "lattices."; + << "lattices."; ans = DeterminizeLatticePhonePrunedFirstPass( trans_model, beam, ifst, det_opts) && ans; @@ -1513,14 +1513,14 @@ template bool DeterminizeLatticePruned( const ExpandedFst &ifst, double prune, - MutableFst *ofst, + MutableFst *ofst, DeterminizeLatticePrunedOptions opts); template bool DeterminizeLatticePruned( const ExpandedFst &ifst, double prune, - MutableFst *ofst, + MutableFst *ofst, DeterminizeLatticePrunedOptions opts); template diff --git a/src/nnet3bin/nnet3-average.cc b/src/nnet3bin/nnet3-average.cc index c82e3b93323..9d4513775d6 100644 --- a/src/nnet3bin/nnet3-average.cc +++ b/src/nnet3bin/nnet3-average.cc @@ -73,10 +73,10 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); - string weights_str; + std::string weights_str; po.Register("weights", &weights_str, "Colon-separated list of weights, one " "for each input model. These will be normalized to sum to one."); - + po.Read(argc, argv); if (po.NumArgs() < 2) { @@ -90,23 +90,23 @@ int main(int argc, char *argv[]) { Nnet nnet; ReadKaldiObject(first_nnet_rxfilename, &nnet); - + int32 num_inputs = po.NumArgs() - 1; std::vector model_weights; GetWeights(weights_str, num_inputs, &model_weights); - + ScaleNnet(model_weights[0], &nnet); - + for (int32 i = 2; i <= num_inputs; i++) { Nnet src_nnet; ReadKaldiObject(po.GetArg(i), &src_nnet); AddNnet(src_nnet, model_weights[i - 1], &nnet); } - + WriteKaldiObject(nnet, nnet_wxfilename, binary_write); - + KALDI_LOG << "Averaged parameters of " << num_inputs << " neural nets, and wrote to " << nnet_wxfilename; return 0; // it will throw an exception if there are any problems. @@ -115,4 +115,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/tools/Makefile b/tools/Makefile index 8ca95ac95ff..4a8e08823a0 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -7,7 +7,7 @@ CC = gcc # used for sph2pipe # Note: OpenFst requires a relatively recent C++ compiler with C++11 support, # e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. -OPENFST_VERSION = 1.5.4 +OPENFST_VERSION = 1.6.0 OPENFST_VER_NUM := $(shell echo $(OPENFST_VERSION) | sed 's/\./ /g' | xargs printf "%d%02d%02d") ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10503)","1") From 5bdd4308b4f9b5c404a135382a15b80a0c8fa61b Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Sun, 8 Jan 2017 21:53:44 -0800 Subject: [PATCH 105/213] Stop relinking dynamic libraries whenever they are updated. 
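
Editorial note on this patch: under KALDI_FLAVOR=dynamic, the per-directory dependent
libraries derived from ADDLIBS used to be collected in XDEPENDS and listed both on the
link command and as make prerequisites of $(LIBFILE), $(BINFILES) and $(TESTFILES).
This patch moves them into EXTRA_LDLIBS instead, and the binaries/tests keep only the
local $(LIBFILE) as a prerequisite, so rebuilding a shared library in another directory
no longer forces a relink of everything that links against it (with dynamic linking the
symbols are resolved at load time anyway, so the relink was presumably unnecessary).
The sketch below is only an illustration of that idea; the names my-prog, libfoo and
the ../base path are placeholders, not the real Kaldi targets, and the actual rules are
in src/makefiles/default_rules.mk in the diff that follows.

    # Link against the dependent library via EXTRA_LDLIBS, but do not make it
    # a prerequisite of the binary.  Previously the rule was effectively
    # "my-prog: libfoo.so ../base/libkaldi-base.so", which relinked my-prog
    # whenever libkaldi-base.so was rebuilt.
    EXTRA_LDLIBS += ../base/libkaldi-base.so
    my-prog: libfoo.so
	$(CXX) -o $@ my-prog.o libfoo.so $(EXTRA_LDLIBS) $(LDLIBS)

(The recipe line must be tab-indented, as usual for make.)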
--- src/makefiles/default_rules.mk | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index fda52521186..34abd905924 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -7,13 +7,13 @@ ifeq ($(KALDI_FLAVOR), dynamic) LIBFILE = lib$(LIBNAME).dylib endif LDFLAGS += -Wl,-rpath -Wl,$(KALDILIBDIR) - XDEPENDS = $(foreach dep,$(ADDLIBS), $(dir $(dep))lib$(notdir $(basename $(dep))).dylib) + EXTRA_LDLIBS += $(foreach dep,$(ADDLIBS), $(dir $(dep))lib$(notdir $(basename $(dep))).dylib) else ifeq ($(shell uname), Linux) ifdef LIBNAME LIBFILE = lib$(LIBNAME).so endif LDFLAGS += -Wl,-rpath=$(shell readlink -f $(KALDILIBDIR)) - XDEPENDS = $(foreach dep,$(ADDLIBS), $(dir $(dep))lib$(notdir $(basename $(dep))).so) + EXTRA_LDLIBS += $(foreach dep,$(ADDLIBS), $(dir $(dep))lib$(notdir $(basename $(dep))).so) else # Platform not supported $(error Dynamic libraries not supported on this platform. Run configure with --static flag.) endif @@ -31,11 +31,11 @@ $(LIBFILE): $(OBJFILES) $(RANLIB) $(LIBNAME).a ifeq ($(KALDI_FLAVOR), dynamic) ifeq ($(shell uname), Darwin) - $(CXX) -dynamiclib -o $@ -install_name @rpath/$@ $(LDFLAGS) $(OBJFILES) $(XDEPENDS) $(LDLIBS) + $(CXX) -dynamiclib -o $@ -install_name @rpath/$@ $(LDFLAGS) $(OBJFILES) $(LDLIBS) rm -f $(KALDILIBDIR)/$@; ln -s $(shell pwd)/$@ $(KALDILIBDIR)/$@ else ifeq ($(shell uname), Linux) # Building shared library from static (static was compiled with -fPIC) - $(CXX) -shared -o $@ -Wl,--no-undefined -Wl,--as-needed -Wl,-soname=$@,--whole-archive $(LIBNAME).a -Wl,--no-whole-archive $(LDFLAGS) $(XDEPENDS) $(LDLIBS) + $(CXX) -shared -o $@ -Wl,--no-undefined -Wl,--as-needed -Wl,-soname=$@,--whole-archive $(LIBNAME).a -Wl,--no-whole-archive $(LDFLAGS) $(LDLIBS) rm -f $(KALDILIBDIR)/$@; ln -s $(shell pwd)/$@ $(KALDILIBDIR)/$@ else # Platform not supported $(error Dynamic libraries not supported on this platform. Run configure with --static flag.) @@ -47,7 +47,11 @@ endif # use the C++ compiler $(CXX) instead. LINK.o = $(CXX) $(LDFLAGS) $(TARGET_ARCH) +ifeq ($(KALDI_FLAVOR), dynamic) +$(BINFILES): $(LIBFILE) +else $(BINFILES): $(LIBFILE) $(XDEPENDS) +endif # Rule below would expand to, e.g.: # ../base/kaldi-base.a: @@ -65,7 +69,11 @@ clean: distclean: clean -rm -f .depend.mk +ifeq ($(KALDI_FLAVOR), dynamic) +$(TESTFILES): $(LIBFILE) +else $(TESTFILES): $(LIBFILE) $(XDEPENDS) +endif test_compile: $(TESTFILES) From 2b6055e3a2f91ddb9c30080443fbfce81913e336 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Sun, 8 Jan 2017 22:29:46 -0800 Subject: [PATCH 106/213] Add -Wno-deprecated-declarations compiler flag to stop warnings about fst::TokenType --- src/makefiles/cygwin.mk | 3 ++- src/makefiles/darwin.mk | 3 ++- src/makefiles/linux_atlas.mk | 3 ++- src/makefiles/linux_atlas_arm.mk | 3 ++- src/makefiles/linux_atlas_ppc64le.mk | 3 ++- src/makefiles/linux_clapack.mk | 3 ++- src/makefiles/linux_clapack_arm.mk | 3 ++- src/makefiles/linux_openblas.mk | 3 ++- src/makefiles/linux_openblas_arm.mk | 3 ++- src/makefiles/linux_openblas_ppc64le.mk | 3 ++- src/makefiles/linux_x86_64_mkl.mk | 3 ++- 11 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index 14ece9d4ee7..e5657818ce5 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -11,7 +11,8 @@ $(error OPENFSTLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. 
-I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_CLAPACK -I../../tools/CLAPACK/ \ -msse -msse2 \ diff --git a/src/makefiles/darwin.mk b/src/makefiles/darwin.mk index 5dbcd6f768b..24fbdca890f 100644 --- a/src/makefiles/darwin.mk +++ b/src/makefiles/darwin.mk @@ -11,7 +11,8 @@ $(error OPENFSTLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK \ -msse -msse2 -pthread \ diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index 9ab038295b6..929461831df 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -17,7 +17,8 @@ $(error ATLASLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) \ -msse -msse2 -pthread -rdynamic \ diff --git a/src/makefiles/linux_atlas_arm.mk b/src/makefiles/linux_atlas_arm.mk index 0dfc32863b4..9b9c42257fb 100644 --- a/src/makefiles/linux_atlas_arm.mk +++ b/src/makefiles/linux_atlas_arm.mk @@ -17,7 +17,8 @@ $(error ATLASLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) \ -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ diff --git a/src/makefiles/linux_atlas_ppc64le.mk b/src/makefiles/linux_atlas_ppc64le.mk index aa121fc5cdc..a0c22927f2e 100644 --- a/src/makefiles/linux_atlas_ppc64le.mk +++ b/src/makefiles/linux_atlas_ppc64le.mk @@ -17,7 +17,8 @@ $(error ATLASLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) \ -m64 -maltivec -mcpu=power8 -mtune=power8 -mpower8-vector -mvsx \ diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index d8f8cf5668f..95c58d0ec22 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -11,7 +11,8 @@ $(error OPENFSTLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. 
-I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK -I../../tools/CLAPACK \ -msse -msse2 -pthread -rdynamic \ diff --git a/src/makefiles/linux_clapack_arm.mk b/src/makefiles/linux_clapack_arm.mk index 432bd689f55..2b15193046b 100644 --- a/src/makefiles/linux_clapack_arm.mk +++ b/src/makefiles/linux_clapack_arm.mk @@ -11,7 +11,8 @@ $(error OPENFSTLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK -I../../tools/CLAPACK \ -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index a859fc7e272..b7b74bff89a 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -17,7 +17,8 @@ $(error OPENBLASLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -I$(OPENBLASINC) \ -msse -msse2 -pthread -rdynamic \ diff --git a/src/makefiles/linux_openblas_arm.mk b/src/makefiles/linux_openblas_arm.mk index 00c4ae2bbdd..344879580aa 100644 --- a/src/makefiles/linux_openblas_arm.mk +++ b/src/makefiles/linux_openblas_arm.mk @@ -17,7 +17,8 @@ $(error OPENBLASLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -I$(OPENBLASINC) \ -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ diff --git a/src/makefiles/linux_openblas_ppc64le.mk b/src/makefiles/linux_openblas_ppc64le.mk index 1e7a391dc79..9225f4922f0 100644 --- a/src/makefiles/linux_openblas_ppc64le.mk +++ b/src/makefiles/linux_openblas_ppc64le.mk @@ -17,7 +17,8 @@ $(error OPENBLASLIBS not defined.) endif CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -I$(OPENBLASINC) \ -m64 -maltivec -mcpu=power8 -mtune=power8 -mpower8-vector -mvsx \ diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index d2aee4a036f..595557a5ef4 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -25,7 +25,8 @@ endif MKLLIB ?= $(MKLROOT)/lib/em64t CXXFLAGS = -std=c++11 -I.. 
-I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ - -Wall -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_MKL -I$(MKLROOT)/include \ -m64 -msse -msse2 -pthread -rdynamic \ From d94e77bf0825deece7f9a60bf881bbdc8026e7dc Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Sun, 8 Jan 2017 22:43:33 -0800 Subject: [PATCH 107/213] Fix test code to conform with OpenFst-1.6 API. --- src/fstext/fstext-utils-test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc index 2802a84cca6..b016b53691f 100644 --- a/src/fstext/fstext-utils-test.cc +++ b/src/fstext/fstext-utils-test.cc @@ -213,7 +213,7 @@ template void TestAcceptorMinimize() { RemoveWeights(fst); VectorFst fst2(*fst); - AcceptorMinimize(&fst2); + internal::AcceptorMinimize(&fst2); assert(RandEquivalent(*fst, fst2, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); From 210b34129e65f020adbd5baae8cc0a2483ec0df1 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Mon, 9 Jan 2017 15:31:13 -0800 Subject: [PATCH 108/213] Add date/time info to travis script. --- tools/extras/travis_script.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 5aefdd3e543..2067476b553 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -50,13 +50,20 @@ CF="\"$CFLAGS -g $(addsw -I $INCDIRS)\"" LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" CCC="$(mtoken CXX "$CXX")" +echo "Building tools..." [Time: $(date)] runvx cd tools runvx make openfst "$CCC" CXXFLAGS="$CF" -j$MAXPAR cd .. + +echo "Building src..." [Time: $(date)] runvx cd src runvx "$CCC" CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root="$XROOT/usr" runvx make all -j$MAXPAR + +echo "Running tests..." [Time: $(date)] runvx make test -k -j$MAXPAR +echo "Done." [Time: $(date)] + #runvx make mklibdir base matrix -j$MAXPAR #runvx make matrix/test From d5fa6cc310f07b38fb9bb1369001bbf800a6487d Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Mon, 9 Jan 2017 22:23:29 -0800 Subject: [PATCH 109/213] Testing Travis CI with different build settings. --- tools/extras/travis_script.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 2067476b553..3ff284cbe8b 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -9,7 +9,7 @@ # LDFLAGS="-llapack" # Maximum make parallelism. Simply -j runs out of memory on Travis VM. -MAXPAR=3 +MAXPAR=4 # Directories with code that can be tested with Travis (space-separated) TESTABLE_DIRS="src/" @@ -46,6 +46,7 @@ then fi # Prepare environment variables +CXX=clang++-3.4 CF="\"$CFLAGS -g $(addsw -I $INCDIRS)\"" LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" CCC="$(mtoken CXX "$CXX")" From 9563cf148017181ff84c1d2cd3253be4c6abbed8 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Mon, 9 Jan 2017 22:39:09 -0800 Subject: [PATCH 110/213] Testing Travis CI with different build settings 2. 
--- tools/extras/travis_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 3ff284cbe8b..5bdb91515f8 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -46,7 +46,7 @@ then fi # Prepare environment variables -CXX=clang++-3.4 +CXX=clang++ CF="\"$CFLAGS -g $(addsw -I $INCDIRS)\"" LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" CCC="$(mtoken CXX "$CXX")" From 5c83a0d3fc81c357db2c00230754035d90894c5e Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Mon, 9 Jan 2017 22:50:40 -0800 Subject: [PATCH 111/213] Testing Travis CI with different build settings 3. --- tools/extras/travis_script.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 5bdb91515f8..c8c6c2d7905 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -46,7 +46,6 @@ then fi # Prepare environment variables -CXX=clang++ CF="\"$CFLAGS -g $(addsw -I $INCDIRS)\"" LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" CCC="$(mtoken CXX "$CXX")" From a32f1f8807cdf1681f23ff3ed903a2c4d44a370d Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 10 Jan 2017 10:52:36 -0800 Subject: [PATCH 112/213] Testing Travis CI with different build settings 4. --- tools/extras/travis_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index c8c6c2d7905..4386ca2e030 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -46,7 +46,7 @@ then fi # Prepare environment variables -CF="\"$CFLAGS -g $(addsw -I $INCDIRS)\"" +CF="\"$CFLAGS -O0 -g $(addsw -I $INCDIRS)\"" LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" CCC="$(mtoken CXX "$CXX")" From 3017cbb1092708aa9a1ee0a24497cf90b4b96bdc Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 10 Jan 2017 11:27:23 -0800 Subject: [PATCH 113/213] Testing Travis CI with different build settings 5. --- tools/extras/travis_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 4386ca2e030..cbd427bb9bd 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -46,7 +46,7 @@ then fi # Prepare environment variables -CF="\"$CFLAGS -O0 -g $(addsw -I $INCDIRS)\"" +CF="\"$CFLAGS -pipe -g $(addsw -I $INCDIRS)\"" LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" CCC="$(mtoken CXX "$CXX")" From 097b8251c007de540689a15b23a0a4a22a7ee8e3 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 10 Jan 2017 11:53:10 -0800 Subject: [PATCH 114/213] Testing Travis CI with different build settings 6. 
--- tools/extras/travis_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index cbd427bb9bd..c8c6c2d7905 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -46,7 +46,7 @@ then fi # Prepare environment variables -CF="\"$CFLAGS -pipe -g $(addsw -I $INCDIRS)\"" +CF="\"$CFLAGS -g $(addsw -I $INCDIRS)\"" LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" CCC="$(mtoken CXX "$CXX")" From 1d237a05278a118df0838bf58e6a340c045dc342 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 18 Jan 2017 20:20:25 -0500 Subject: [PATCH 115/213] Add more specific compilation instructions in configure script --- src/configure | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/configure b/src/configure index 3388d8ebd50..bf478b5b73f 100755 --- a/src/configure +++ b/src/configure @@ -1180,4 +1180,6 @@ if [ -n "$ENV_LDLIBS" ]; then echo "LDLIBS += $ENV_LDLIBS" >> kaldi.mk; fi # and possibly modifies the kaldi.mk file that we just generated. check_for_slow_expf; echo "SUCCESS" +echo "To compile: make clean -j; make depend -j; make -j" +echo " ... or e.g. -j 10, instead of -j, to use a specified number of CPUs" exit 0; From 3379167272de6d20cc64164711ffc105c169d0bb Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 19 Jan 2017 16:23:28 -0500 Subject: [PATCH 116/213] [src] Fix compilation issues on mac --- src/matrix/kaldi-vector.cc | 8 ++++---- src/nnet3/nnet-example-utils.cc | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index 87237369680..057569d1182 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -1029,8 +1029,8 @@ template void VectorBase::AddVec(const Real alpha, const VectorBase &v) { KALDI_ASSERT(dim_ == v.dim_); // remove __restrict__ if it causes compilation problems. - register Real *__restrict__ data = data_; - register OtherReal *__restrict__ other_data = v.data_; + Real *__restrict__ data = data_; + OtherReal *__restrict__ other_data = v.data_; MatrixIndexT dim = dim_; if (alpha != 1.0) for (MatrixIndexT i = 0; i < dim; i++) @@ -1050,8 +1050,8 @@ template void VectorBase::AddVec2(const Real alpha, const VectorBase &v) { KALDI_ASSERT(dim_ == v.dim_); // remove __restrict__ if it causes compilation problems. 
- register Real *__restrict__ data = data_; - register OtherReal *__restrict__ other_data = v.data_; + Real *__restrict__ data = data_; + OtherReal *__restrict__ other_data = v.data_; MatrixIndexT dim = dim_; if (alpha != 1.0) for (MatrixIndexT i = 0; i < dim; i++) diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 7c3743c3a7f..088772bcba7 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -319,7 +319,7 @@ void ExampleGenerationConfig::ComputeDerived() { } KALDI_LOG << "Rounding up --num-frames=" << num_frames_str << " to multiples of --frame-subsampling-factor=" << m - << ", to: " << rounded_num_frames_str; + << ", to: " << rounded_num_frames_str.str(); } } From 23523213a1cf07662759333215473fefe81c920a Mon Sep 17 00:00:00 2001 From: Yiming Wang Date: Thu, 19 Jan 2017 23:25:05 -0500 Subject: [PATCH 117/213] [src] nnet3: removed the declaration of SetZero() in nnet-utils.h (#1358) --- src/nnet3/nnet-utils.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 2bcb0fdb0f6..95c28caf746 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -53,17 +53,6 @@ int32 NumOutputNodes(const Nnet &nnet); /// returns the number of input nodes of this nnet. int32 NumInputNodes(const Nnet &nnet); -/// Calls SetZero (with the given is_gradient parameter) on all updatable -/// components of the nnet; calls ZeroComponentStats on all other components -/// that inherit from NonlinearComponent; and (just in case) calls Scale(0.0) on -/// all other components. -/// It's the same as ScaleNnet(0.0, nnet) except that if is_gradient is true it -/// can set the is_gradient_ flag on updatable components [to force simple -/// update]; and unlike ScaleNnet(0.0, nnet) it will get rid of NaNs that have -/// crept into the parameters or stats. -void SetZero(bool is_gradient, - Nnet *nnet); - /// Calls PerturbParams (with the given stddev) on all updatable components of /// the nnet. 
void PerturbParams(BaseFloat stddev, From b91711ccc24e0fd57550f82ec28d2cd452e96efc Mon Sep 17 00:00:00 2001 From: Kirill Katsnelson Date: Fri, 20 Jan 2017 00:03:08 -0800 Subject: [PATCH 118/213] [build] Enable Travis CI on the 'shortcut' branch (#1359) --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 85bbc7a52e4..d3ad85363ce 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,6 +20,7 @@ addons: branches: only: - master + - shortcut before_install: - cat /proc/sys/kernel/core_pattern From 8d64376fea36cd4a4eda164e65bb8dd043e750b1 Mon Sep 17 00:00:00 2001 From: Ke Li Date: Mon, 23 Jan 2017 01:09:11 -0500 Subject: [PATCH 119/213] [src] cudamatrix: modify test code to guarantee loop only run once if no GPU (#1366) --- src/cudamatrix/cu-array-test.cc | 5 +++-- src/cudamatrix/cu-block-matrix-test.cc | 6 ++++-- src/cudamatrix/cu-device-test.cc | 7 ++++--- src/cudamatrix/cu-math-test.cc | 5 +++-- src/cudamatrix/cu-matrix-speed-test.cc | 6 +++--- src/cudamatrix/cu-matrix-test.cc | 9 +++++---- src/cudamatrix/cu-rand-speed-test.cc | 2 +- src/cudamatrix/cu-sp-matrix-speed-test.cc | 2 +- src/cudamatrix/cu-sp-matrix-test.cc | 6 +++--- src/cudamatrix/cu-sparse-matrix-test.cc | 19 +++++++++++-------- src/cudamatrix/cu-test.cc | 6 ++---- src/cudamatrix/cu-tp-matrix-test.cc | 6 +++--- src/cudamatrix/cu-vector-speed-test.cc | 2 +- src/cudamatrix/cu-vector-test.cc | 8 ++++---- 14 files changed, 48 insertions(+), 41 deletions(-) diff --git a/src/cudamatrix/cu-array-test.cc b/src/cudamatrix/cu-array-test.cc index f3ebcb72ee0..863ca5dde18 100644 --- a/src/cudamatrix/cu-array-test.cc +++ b/src/cudamatrix/cu-array-test.cc @@ -116,8 +116,9 @@ static void UnitTestCuArray() { int main() { - for (int32 loop = 0; loop < 2; loop++) { + int32 loop = 0; #if HAVE_CUDA == 1 + for (; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); @@ -134,8 +135,8 @@ int main() { KALDI_LOG << "Tests without GPU use succeeded."; else KALDI_LOG << "Tests with GPU use (if available) succeeded."; - } #if HAVE_CUDA == 1 + } CuDevice::Instantiate().PrintProfile(); #endif return 0; diff --git a/src/cudamatrix/cu-block-matrix-test.cc b/src/cudamatrix/cu-block-matrix-test.cc index 4193e61c609..387749904b1 100644 --- a/src/cudamatrix/cu-block-matrix-test.cc +++ b/src/cudamatrix/cu-block-matrix-test.cc @@ -181,8 +181,9 @@ template void CuBlockMatrixUnitTest() { int main() { - for (int32 loop = 0; loop < 2; loop++) { + int32 loop = 0; #if HAVE_CUDA == 1 + for (; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU @@ -200,12 +201,13 @@ int main() { #else kaldi::CuBlockMatrixUnitTest(); #endif + if (loop == 0) KALDI_LOG << "Tests without GPU use succeeded."; else KALDI_LOG << "Tests with GPU use (if available) succeeded."; - } #if HAVE_CUDA == 1 + } CuDevice::Instantiate().PrintProfile(); #endif return 0; diff --git a/src/cudamatrix/cu-device-test.cc b/src/cudamatrix/cu-device-test.cc index ec0fa7b1f9f..8f44985ede0 100644 --- a/src/cudamatrix/cu-device-test.cc +++ b/src/cudamatrix/cu-device-test.cc @@ -99,8 +99,8 @@ void CudaMatrixResizeTest() { int main() { - for (int32 loop = 0; loop < 2; loop++) { #if HAVE_CUDA == 1 + for (int32 loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); @@ -118,9 +118,10 @@ int main() { #else kaldi::CudaMatrixResizeTest(); 
#endif - } + #if HAVE_CUDA == 1 + } CuDevice::Instantiate().PrintProfile(); #endif - std::cout << "Tests succeeded.\n"; + KALDI_LOG << "Tests succeeded."; } diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 9a78c652745..abd93fb1a0a 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -537,8 +537,9 @@ template void CudaMathUnitTest() { int main() { - for (int32 loop = 0; loop < 2; loop++) { + int32 loop = 0; #if HAVE_CUDA == 1 + for (; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU @@ -562,8 +563,8 @@ int main() { KALDI_LOG << "Tests without GPU use succeeded."; else KALDI_LOG << "Tests with GPU use (if available) succeeded."; - } #if HAVE_CUDA == 1 + } CuDevice::Instantiate().PrintProfile(); #endif return 0; diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc index 032351564c0..0e139cf9ec3 100644 --- a/src/cudamatrix/cu-matrix-speed-test.cc +++ b/src/cudamatrix/cu-matrix-speed-test.cc @@ -1085,8 +1085,8 @@ template void CudaMatrixSpeedTest() { int main() { - for (int32 loop = 0; loop < 2; loop++) { #if HAVE_CUDA == 1 + for (int32 loop = 0; loop < 2; loop++) { if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); else @@ -1103,9 +1103,9 @@ int main() { #else kaldi::CudaMatrixSpeedTest(); #endif - } #if HAVE_CUDA == 1 + } CuDevice::Instantiate().PrintProfile(); #endif - std::cout << "Tests succeeded.\n"; + KALDI_LOG << "Tests succeeded."; } diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index a6f84f3f6aa..38c800d8e58 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2707,8 +2707,9 @@ template void CudaMatrixUnitTest() { int main() { - for (int32 loop = 0; loop < 2; loop++) { + int32 loop = 0; #if HAVE_CUDA == 1 + for (; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); @@ -2718,7 +2719,6 @@ int main() { kaldi::CudaMatrixUnitTest(); - #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) { kaldi::CudaMatrixUnitTest(); @@ -2733,9 +2733,10 @@ int main() { KALDI_LOG << "Tests without GPU use succeeded."; else KALDI_LOG << "Tests with GPU use (if available) succeeded."; - } - SetVerboseLevel(4); + + SetVerboseLevel(4); #if HAVE_CUDA == 1 + } CuDevice::Instantiate().PrintProfile(); #endif return 0; diff --git a/src/cudamatrix/cu-rand-speed-test.cc b/src/cudamatrix/cu-rand-speed-test.cc index 23f82eab977..abcae76c598 100644 --- a/src/cudamatrix/cu-rand-speed-test.cc +++ b/src/cudamatrix/cu-rand-speed-test.cc @@ -218,5 +218,5 @@ int main() { CuDevice::Instantiate().PrintProfile(); #endif - std::cout << "Tests succeeded.\n"; + KALDI_LOG << "Tests succeeded."; } diff --git a/src/cudamatrix/cu-sp-matrix-speed-test.cc b/src/cudamatrix/cu-sp-matrix-speed-test.cc index 455bf58608f..ded4baed49b 100644 --- a/src/cudamatrix/cu-sp-matrix-speed-test.cc +++ b/src/cudamatrix/cu-sp-matrix-speed-test.cc @@ -146,5 +146,5 @@ int main() { #if HAVE_CUDA == 1 CuDevice::Instantiate().PrintProfile(); #endif - std::cout << "Tests succeeded.\n"; + KALDI_LOG << "Tests succeeded."; } diff --git a/src/cudamatrix/cu-sp-matrix-test.cc b/src/cudamatrix/cu-sp-matrix-test.cc index 3e3991afc81..c0f1119acea 100644 --- a/src/cudamatrix/cu-sp-matrix-test.cc +++ b/src/cudamatrix/cu-sp-matrix-test.cc @@ -363,9 +363,9 @@ template void CudaSpMatrixUnitTest() { int 
main() { using namespace kaldi; - - for (int32 loop = 0; loop < 2; loop++) { + int32 loop = 0; #if HAVE_CUDA == 1 + for (; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU @@ -394,8 +394,8 @@ int main() { KALDI_LOG << "Tests without GPU use succeeded."; else KALDI_LOG << "Tests with GPU use (if available) succeeded."; - } #if HAVE_CUDA == 1 + } CuDevice::Instantiate().PrintProfile(); #endif return 0; diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc index 8f885815c72..6514ddbfa87 100644 --- a/src/cudamatrix/cu-sparse-matrix-test.cc +++ b/src/cudamatrix/cu-sparse-matrix-test.cc @@ -25,6 +25,8 @@ #include "util/common-utils.h" #include "cudamatrix/cu-matrix-lib.h" +using namespace kaldi; + namespace kaldi { template @@ -185,19 +187,20 @@ void CudaSparseMatrixUnitTest() { int main() { - for (kaldi::int32 loop = 0; loop < 2; loop++) { + int32 loop = 0; #if HAVE_CUDA == 1 - kaldi::CuDevice::Instantiate().SetDebugStrideMode(true); + for (; loop < 2; loop++) { + CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) - kaldi::CuDevice::Instantiate().SelectGpuId("no"); + CuDevice::Instantiate().SelectGpuId("no"); else - kaldi::CuDevice::Instantiate().SelectGpuId("yes"); + CuDevice::Instantiate().SelectGpuId("yes"); #endif kaldi::CudaSparseMatrixUnitTest(); #if HAVE_CUDA == 1 - if (kaldi::CuDevice::Instantiate().DoublePrecisionSupported()) { + if (CuDevice::Instantiate().DoublePrecisionSupported()) { kaldi::CudaSparseMatrixUnitTest(); } else { KALDI_WARN << "Double precision not supported"; @@ -210,10 +213,10 @@ int main() { KALDI_LOG << "Tests without GPU use succeeded."; else KALDI_LOG << "Tests with GPU use (if available) succeeded."; - } - kaldi::SetVerboseLevel(4); + SetVerboseLevel(4); #if HAVE_CUDA == 1 - kaldi::CuDevice::Instantiate().PrintProfile(); + } + CuDevice::Instantiate().PrintProfile(); #endif return 0; } diff --git a/src/cudamatrix/cu-test.cc b/src/cudamatrix/cu-test.cc index c27e2b64691..66b62f097c9 100644 --- a/src/cudamatrix/cu-test.cc +++ b/src/cudamatrix/cu-test.cc @@ -575,9 +575,8 @@ static void CuMatrixUnitTest() { int main() { using namespace kaldi; - - for (int32 loop = 0; loop < 2; loop++) { #if HAVE_CUDA == 1 + for (int32 loop = 0; loop < 2; loop++) { if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); else @@ -593,9 +592,8 @@ int main() { { kaldi::CuMatrixUnitTest(); } - } - #if HAVE_CUDA == 1 + } kaldi::CuDevice::Instantiate().PrintProfile(); #endif diff --git a/src/cudamatrix/cu-tp-matrix-test.cc b/src/cudamatrix/cu-tp-matrix-test.cc index 675cd19a56c..f5018aef6b7 100644 --- a/src/cudamatrix/cu-tp-matrix-test.cc +++ b/src/cudamatrix/cu-tp-matrix-test.cc @@ -187,9 +187,9 @@ template void CudaTpMatrixUnitTest() { int main() { using namespace kaldi; - - for (int32 loop = 0; loop < 2; loop++) { + int32 loop = 0; #if HAVE_CUDA == 1 + for (; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU @@ -211,8 +211,8 @@ int main() { KALDI_LOG << "Tests without GPU use succeeded."; else KALDI_LOG << "Tests with GPU use (if available) succeeded."; - } #if HAVE_CUDA == 1 + } CuDevice::Instantiate().PrintProfile(); #endif return 0; diff --git a/src/cudamatrix/cu-vector-speed-test.cc b/src/cudamatrix/cu-vector-speed-test.cc index 81f6f2bf14d..cf3f126937f 100644 --- a/src/cudamatrix/cu-vector-speed-test.cc +++ 
b/src/cudamatrix/cu-vector-speed-test.cc @@ -319,6 +319,6 @@ int main() { #else kaldi::CudaVectorSpeedTest(); #endif - std::cout << "Tests succeeded.\n"; + KALDI_LOG << "Tests succeeded."; } diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc index a17a7baa930..6537bab70c6 100644 --- a/src/cudamatrix/cu-vector-test.cc +++ b/src/cudamatrix/cu-vector-test.cc @@ -755,9 +755,10 @@ int main(int argc, char *argv[]) { po.PrintUsage(); exit(1); } - - for (int32 loop = 0; loop < 2; loop++) { + + int32 loop = 0; #if HAVE_CUDA == 1 + for (; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU @@ -765,7 +766,6 @@ int main(int argc, char *argv[]) { CuDevice::Instantiate().SelectGpuId(use_gpu); #endif - kaldi::CuVectorUnitTest(); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) { @@ -781,8 +781,8 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Tests without GPU use succeeded."; else KALDI_LOG << "Tests with GPU use (if available) succeeded."; - } #if HAVE_CUDA == 1 + } CuDevice::Instantiate().PrintProfile(); #endif return 0; From fd5238ad579b4beaa60f8b30818622ac670a4c6a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 22 Jan 2017 23:30:09 -0500 Subject: [PATCH 120/213] [src] nnet3: Extending nnet3-combine to support soft enforcement of sum-to-one. --- src/nnet3/nnet-combine.cc | 177 ++++++++++++++++++++++++++------------ src/nnet3/nnet-combine.h | 61 +++++++++---- 2 files changed, 166 insertions(+), 72 deletions(-) diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc index 07a96d143c2..d50b5adc072 100644 --- a/src/nnet3/nnet-combine.cc +++ b/src/nnet3/nnet-combine.cc @@ -34,7 +34,13 @@ NnetCombiner::NnetCombiner(const NnetCombineConfig &config, nnet_params_(std::min(num_nnets, config_.max_effective_inputs), NumParameters(first_nnet)), tot_input_weighting_(nnet_params_.NumRows()) { - SetDropoutProportion(0, &nnet_); + + if (config_.sum_to_one_penalty != 0.0 && + config_.enforce_sum_to_one) { + KALDI_WARN << "--sum-to-one-penalty=" << config_.sum_to_one_penalty + << " is nonzero, so setting --enforce-sum-to-one=false."; + config_.enforce_sum_to_one = false; + } SubVector first_params(nnet_params_, 0); VectorizeNnet(nnet_, &first_params); tot_input_weighting_(0) += 1.0; @@ -43,7 +49,6 @@ NnetCombiner::NnetCombiner(const NnetCombineConfig &config, NnetComputeProbOptions compute_prob_opts; compute_prob_opts.compute_deriv = true; prob_computer_ = new NnetComputeProb(compute_prob_opts, nnet_); - } void NnetCombiner::ComputeUpdatableComponentDims(){ @@ -130,12 +135,12 @@ void NnetCombiner::Combine() { // itself, so this is BFGS. 
lbfgs_options.first_step_impr = config_.initial_impr; - Vector params(dim), deriv(dim); - BaseFloat objf, initial_objf; + Vector params(dim), deriv(dim); + double objf, initial_objf; GetInitialParameters(¶ms); - OptimizeLbfgs lbfgs(params, lbfgs_options); + OptimizeLbfgs lbfgs(params, lbfgs_options); for (int32 i = 0; i < config_.num_iters; i++) { params.CopyFromVec(lbfgs.GetProposedValue()); @@ -146,12 +151,25 @@ void NnetCombiner::Combine() { lbfgs.DoStep(objf, deriv); } - KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf; + if (!config_.sum_to_one_penalty) { + KALDI_LOG << "Combining nnets, objective function changed from " + << initial_objf << " to " << objf; + } else { + Vector weights(WeightDim()); + GetWeights(params, &weights); + bool print_weights = true; + double penalty = GetSumToOnePenalty(weights, NULL, print_weights); + // note: initial_objf has no penalty term because it summed exactly + // to one. + KALDI_LOG << "Combining nnets, objective function changed from " + << initial_objf << " to " << objf << " = " + << (objf - penalty) << " + " << penalty; + } + // must recompute nnet_ if "params" is not exactly equal to the // final params that LB - Vector final_params(dim); + Vector final_params(dim); final_params.CopyFromVec(lbfgs.GetValue(&objf)); if (!params.ApproxEqual(final_params, 0.0)) { // the following call makes sure that nnet_ corresponds to the parameters @@ -162,9 +180,9 @@ void NnetCombiner::Combine() { } -void NnetCombiner::PrintParams(const VectorBase ¶ms) const { +void NnetCombiner::PrintParams(const VectorBase ¶ms) const { - Vector weights(params.Dim()), normalized_weights(params.Dim()); + Vector weights(params.Dim()), normalized_weights(params.Dim()); GetWeights(params, &weights); GetNormalizedWeights(weights, &normalized_weights); int32 num_models = nnet_params_.NumRows(), @@ -214,21 +232,21 @@ void NnetCombiner::PrintParams(const VectorBase ¶ms) const { bool NnetCombiner::SelfTestDerivatives() { int32 num_tests = 2; // more properly, this is the number of dimensions in a // single test. - BaseFloat delta = 0.001; + double delta = 0.001; int32 dim = ParameterDim(); - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), + Vector params(dim), deriv(dim); + Vector predicted_changes(num_tests), observed_changes(num_tests); GetInitialParameters(¶ms); - BaseFloat initial_objf = ComputeObjfAndDerivFromParameters(params, + double initial_objf = ComputeObjfAndDerivFromParameters(params, &deriv); for (int32 i = 0; i < num_tests; i++) { - Vector new_deriv(dim), offset(dim), new_params(params); + Vector new_deriv(dim), offset(dim), new_params(params); offset.SetRandn(); new_params.AddVec(delta, offset); - BaseFloat new_objf = ComputeObjfAndDerivFromParameters(new_params, + double new_objf = ComputeObjfAndDerivFromParameters(new_params, &new_deriv); // for predicted changes, interpolate old and new derivs. predicted_changes(i) = @@ -236,7 +254,7 @@ bool NnetCombiner::SelfTestDerivatives() { 0.5 * VecVec(new_params, new_deriv) - 0.5 * VecVec(params, new_deriv); observed_changes(i) = new_objf - initial_objf; } - BaseFloat threshold = 0.1; + double threshold = 0.1; KALDI_LOG << "predicted_changes = " << predicted_changes; KALDI_LOG << "observed_changes = " << observed_changes; if (!ApproxEqual(predicted_changes, observed_changes, threshold)) { @@ -253,23 +271,23 @@ void NnetCombiner::SelfTestModelDerivatives() { // single test. 
int32 dim = ParameterDim(); - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), + Vector params(dim), deriv(dim); + Vector predicted_changes(num_tests), observed_changes(num_tests); GetInitialParameters(¶ms); - Vector weights(WeightDim()), normalized_weights(WeightDim()), - nnet_params(NnetParameterDim(), kUndefined), + Vector weights(WeightDim()), normalized_weights(WeightDim()); + Vector nnet_params(NnetParameterDim(), kUndefined), nnet_deriv(NnetParameterDim(), kUndefined); GetWeights(params, &weights); GetNormalizedWeights(weights, &normalized_weights); GetNnetParameters(normalized_weights, &nnet_params); - BaseFloat initial_objf = ComputeObjfAndDerivFromNnet(nnet_params, + double initial_objf = ComputeObjfAndDerivFromNnet(nnet_params, &nnet_deriv); - BaseFloat delta = 0.002 * std::sqrt(VecVec(nnet_params, nnet_params) / - NnetParameterDim()); + double delta = 0.002 * std::sqrt(VecVec(nnet_params, nnet_params) / + NnetParameterDim()); for (int32 i = 0; i < num_tests; i++) { @@ -277,7 +295,7 @@ void NnetCombiner::SelfTestModelDerivatives() { offset(NnetParameterDim()), new_nnet_params(nnet_params); offset.SetRandn(); new_nnet_params.AddVec(delta, offset); - BaseFloat new_objf = ComputeObjfAndDerivFromNnet(new_nnet_params, + double new_objf = ComputeObjfAndDerivFromNnet(new_nnet_params, &new_nnet_deriv); // for predicted changes, interpolate old and new derivs. predicted_changes(i) = @@ -287,7 +305,7 @@ void NnetCombiner::SelfTestModelDerivatives() { 0.5 * VecVec(nnet_params, new_nnet_deriv); observed_changes(i) = new_objf - initial_objf; } - BaseFloat threshold = 0.1; + double threshold = 0.1; KALDI_LOG << "model-derivatives: predicted_changes = " << predicted_changes; KALDI_LOG << "model-derivatives: observed_changes = " << observed_changes; if (!ApproxEqual(predicted_changes, observed_changes, threshold)) @@ -305,7 +323,7 @@ int32 NnetCombiner::ParameterDim() const { } -void NnetCombiner::GetInitialParameters(VectorBase *params) const { +void NnetCombiner::GetInitialParameters(VectorBase *params) const { KALDI_ASSERT(params->Dim() == ParameterDim()); params->Set(1.0 / nnet_params_.NumRows()); if (config_.enforce_positive_weights) { @@ -315,8 +333,8 @@ void NnetCombiner::GetInitialParameters(VectorBase *params) const { } } -void NnetCombiner::GetWeights(const VectorBase ¶ms, - VectorBase *weights) const { +void NnetCombiner::GetWeights(const VectorBase ¶ms, + VectorBase *weights) const { KALDI_ASSERT(weights->Dim() == WeightDim()); if (config_.separate_weights_per_component) { weights->CopyFromVec(params); @@ -336,12 +354,12 @@ void NnetCombiner::GetWeights(const VectorBase ¶ms, } -void NnetCombiner::GetParamsDeriv(const VectorBase &weights, - const VectorBase &weights_deriv, - VectorBase *param_deriv) { +void NnetCombiner::GetParamsDeriv(const VectorBase &weights, + const VectorBase &weights_deriv, + VectorBase *param_deriv) { KALDI_ASSERT(weights.Dim() == WeightDim() && param_deriv->Dim() == ParameterDim()); - Vector preexp_weights_deriv(weights_deriv); + Vector preexp_weights_deriv(weights_deriv); if (config_.enforce_positive_weights) { // to enforce positive weights we first compute weights (call these // preexp_weights) and then take exponential. Note, d/dx exp(x) = exp(x). 
@@ -361,7 +379,55 @@ void NnetCombiner::GetParamsDeriv(const VectorBase &weights, } -void NnetCombiner::GetNnetParameters(const Vector &weights, +double NnetCombiner::GetSumToOnePenalty( + const VectorBase &weights, + VectorBase *weights_penalty_deriv, + bool print_weights) const { + + KALDI_ASSERT(config_.sum_to_one_penalty >= 0.0); + double penalty = config_.sum_to_one_penalty; + if (penalty == 0.0) { + weights_penalty_deriv->SetZero(); + return 0.0; + } + double ans = 0.0; + int32 num_uc = NumUpdatableComponents(), + num_models = nnet_params_.NumRows(); + Vector tot_weights(num_uc); + std::ostringstream tot_weight_info; + for (int32 c = 0; c < num_uc; c++) { + double this_total_weight = 0.0; + for (int32 m = 0; m < num_models; m++) { + int32 index = m * num_uc + c; + double this_weight = weights(index); + this_total_weight += this_weight; + } + tot_weights(c) = this_total_weight; + ans += -0.5 * penalty * + (this_total_weight - 1.0) * (this_total_weight - 1.0); + if (weights_penalty_deriv != NULL) { + KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); + // this_total_weight_deriv is the derivative of the penalty + // term w.r.t. this component's total weight. + double this_total_weight_deriv = + penalty * (1.0 - this_total_weight); + for (int32 m = 0; m < num_models; m++) { + int32 index = m * num_uc + c; + (*weights_penalty_deriv)(index) = this_total_weight_deriv; + } + } + } + if (print_weights) { + Vector tot_weights_float(tot_weights); + KALDI_LOG << "Total weights per component: " + << PrintVectorPerUpdatableComponent(nnet_, + tot_weights_float); + } + return ans; +} + + +void NnetCombiner::GetNnetParameters(const Vector &weights, VectorBase *nnet_params) const { KALDI_ASSERT(nnet_params->Dim() == nnet_params_.NumCols()); nnet_params->SetZero(); @@ -387,7 +453,7 @@ void NnetCombiner::GetNnetParameters(const Vector &weights, // compare GetNnetParameters. 
void NnetCombiner::GetWeightsDeriv( const VectorBase &nnet_params_deriv, - VectorBase *weights_deriv) { + VectorBase *weights_deriv) { KALDI_ASSERT(nnet_params_deriv.Dim() == nnet_params_.NumCols() && weights_deriv->Dim() == WeightDim()); int32 num_uc = NumUpdatableComponents(), @@ -438,30 +504,35 @@ double NnetCombiner::ComputeObjfAndDerivFromNnet( double NnetCombiner::ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv) { - Vector weights(WeightDim()), normalized_weights(WeightDim()), - nnet_params(NnetParameterDim(), kUndefined), - nnet_params_deriv(NnetParameterDim(), kUndefined), + VectorBase ¶ms, + VectorBase *params_deriv) { + Vector weights(WeightDim()), normalized_weights(WeightDim()), + weights_sum_to_one_penalty_deriv(WeightDim()), normalized_weights_deriv(WeightDim()), weights_deriv(WeightDim()); + Vector + nnet_params(NnetParameterDim(), kUndefined), + nnet_params_deriv(NnetParameterDim(), kUndefined); GetWeights(params, &weights); + double ans = GetSumToOnePenalty(weights, &weights_sum_to_one_penalty_deriv); GetNormalizedWeights(weights, &normalized_weights); GetNnetParameters(normalized_weights, &nnet_params); - double ans = ComputeObjfAndDerivFromNnet(nnet_params, &nnet_params_deriv); + ans += ComputeObjfAndDerivFromNnet(nnet_params, &nnet_params_deriv); if (ans != ans || ans - ans != 0) // NaN or inf return ans; // No point computing derivative GetWeightsDeriv(nnet_params_deriv, &normalized_weights_deriv); GetUnnormalizedWeightsDeriv(weights, normalized_weights_deriv, &weights_deriv); + weights_deriv.AddVec(1.0, weights_sum_to_one_penalty_deriv); GetParamsDeriv(weights, weights_deriv, params_deriv); return ans; } -// enforces the constraint that the weights for each component must sum to one. +// enforces the constraint that the weights for each component must sum to one, +// if necessary. void NnetCombiner::GetNormalizedWeights( - const VectorBase &unnorm_weights, - VectorBase *norm_weights) const { + const VectorBase &unnorm_weights, + VectorBase *norm_weights) const { if (!config_.enforce_sum_to_one) { norm_weights->CopyFromVec(unnorm_weights); return; @@ -469,12 +540,12 @@ void NnetCombiner::GetNormalizedWeights( int32 num_uc = NumUpdatableComponents(), num_models = nnet_params_.NumRows(); for (int32 c = 0; c < num_uc; c++) { - BaseFloat sum = 0.0; + double sum = 0.0; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; sum += unnorm_weights(index); } - BaseFloat inv_sum = 1.0 / sum; // if it's NaN then it's OK, we'll get NaN + double inv_sum = 1.0 / sum; // if it's NaN then it's OK, we'll get NaN // weights and eventually -inf objective. 
for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; @@ -484,9 +555,9 @@ void NnetCombiner::GetNormalizedWeights( } void NnetCombiner::GetUnnormalizedWeightsDeriv( - const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv) { + const VectorBase &unnorm_weights, + const VectorBase &norm_weights_deriv, + VectorBase *unnorm_weights_deriv) { if (!config_.enforce_sum_to_one) { unnorm_weights_deriv->CopyFromVec(norm_weights_deriv); return; @@ -494,13 +565,13 @@ void NnetCombiner::GetUnnormalizedWeightsDeriv( int32 num_uc = NumUpdatableComponents(), num_models = nnet_params_.NumRows(); for (int32 c = 0; c < num_uc; c++) { - BaseFloat sum = 0.0; + double sum = 0.0; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; sum += unnorm_weights(index); } - BaseFloat inv_sum = 1.0 / sum; - BaseFloat inv_sum_deriv = 0.0; + double inv_sum = 1.0 / sum; + double inv_sum_deriv = 0.0; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; // in the forward direction, we'd do: @@ -509,7 +580,7 @@ void NnetCombiner::GetUnnormalizedWeightsDeriv( inv_sum_deriv += norm_weights_deriv(index) * unnorm_weights(index); } // note: d/dx (1/x) = -1/x^2 - BaseFloat sum_deriv = -1.0 * inv_sum_deriv * inv_sum * inv_sum; + double sum_deriv = -1.0 * inv_sum_deriv * inv_sum * inv_sum; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; (*unnorm_weights_deriv)(index) += sum_deriv; diff --git a/src/nnet3/nnet-combine.h b/src/nnet3/nnet-combine.h index a2883dab5b2..5b60d30b8ed 100644 --- a/src/nnet3/nnet-combine.h +++ b/src/nnet3/nnet-combine.h @@ -48,6 +48,7 @@ struct NnetCombineConfig { bool test_gradient; bool enforce_positive_weights; bool enforce_sum_to_one; + BaseFloat sum_to_one_penalty; bool separate_weights_per_component; NnetCombineConfig(): num_iters(60), initial_impr(0.01), @@ -55,6 +56,7 @@ struct NnetCombineConfig { test_gradient(false), enforce_positive_weights(false), enforce_sum_to_one(false), + sum_to_one_penalty(0.0), separate_weights_per_component(true) { } void Register(OptionsItf *po) { @@ -73,6 +75,11 @@ struct NnetCombineConfig { "If true, enforce that all weights are positive."); po->Register("enforce-sum-to-one", &enforce_sum_to_one, "If true, enforce that " "the model weights for each component should sum to one."); + po->Register("sum-to-one-penalty", &sum_to_one_penalty, "If >0, a penalty term " + "on the squared difference between sum(weights) for one component," + " and 1.0. This is like --enforce-sum-to-one, but done in a 'soft' " + "way (e.g. maybe useful with dropout). We suggest small values " + "like 10e-3 (for regular nnets) or 1.0e-04 (for chain models)."); po->Register("separate-weights-per-component", &separate_weights_per_component, "If true, have a separate weight for each updatable component in " "the nnet."); @@ -104,7 +111,7 @@ class NnetCombiner { ~NnetCombiner() { delete prob_computer_; } private: - const NnetCombineConfig &config_; + NnetCombineConfig config_; const std::vector &egs_; @@ -126,8 +133,9 @@ class NnetCombiner { Matrix nnet_params_; // This vector has the same dimension as nnet_params_.NumRows(), - // and helps us normalize so each row of nnet_params correspondss to - // a weighted average of its inputs. + // and helps us normalize so each row of nnet_params corresponds to + // a weighted average of its inputs (will be all ones if + // config_.max_effective_inputs >= the number of nnets provided). 
Vector tot_input_weighting_; // returns the parameter dimension, i.e. the dimension of the parameters that @@ -149,7 +157,7 @@ class NnetCombiner { // Computes the initial parameters. The parameters are the underlying thing // that we optimize; their dimension equals ParameterDim(). They are not the same // thing as the nnet parameters. - void GetInitialParameters(VectorBase *params) const; + void GetInitialParameters(VectorBase *params) const; // Tests that derivatives are accurate. Prints warning and returns false if not. bool SelfTestDerivatives(); @@ -159,33 +167,48 @@ class NnetCombiner { // prints the parameters via logging statements. - void PrintParams(const VectorBase ¶ms) const; + void PrintParams(const VectorBase ¶ms) const; // This function computes the objective function (and its derivative, if the objective // function is finite) at the given value of the parameters (the parameters we're optimizing, // i.e. the combination weights; not the nnet parameters. This function calls most of the // functions below. double ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv); + VectorBase ¶ms, + VectorBase *params_deriv); // Computes the weights from the parameters in a config-dependent way. The // weight dimension is always (the number of updatable components times // nnet_params_.NumRows()). - void GetWeights(const VectorBase ¶ms, - VectorBase *weights) const; + void GetWeights(const VectorBase ¶ms, + VectorBase *weights) const; // Given the raw weights: if config_.enforce_sum_to_one, then compute weights // with sum-to-one constrint per component included; else just copy input to // output. - void GetNormalizedWeights(const VectorBase &unnorm_weights, - VectorBase *norm_weights) const; + void GetNormalizedWeights(const VectorBase &unnorm_weights, + VectorBase *norm_weights) const; + + // if config_.sum_to_one_penalty is 0.0, returns 0.0 and sets + // weights_penalty_deriv to 0.0; else it computes, for each + // updatable component u the total weight w_u, returns the value + // -0.5 * config_.sum_to_one_penalty * sum_u (w_u - 1.0)^2; + // and sets 'weights_penalty_deriv' to the derivative w.r.t. + // the result. + // Note: config_.sum_to_one_penalty is exclusive with + // config_.enforce_sum_to_one, so there is really no distinction between + // normalized and unnormalized weights here (since normalization would be a + // no-op). + double GetSumToOnePenalty(const VectorBase &weights, + VectorBase *weights_penalty_deriv, + bool print_weights = false) const; + // Computes the nnet-parameter vector from the normalized weights and // nnet_params_, as a vector. (See the functions Vectorize() and // UnVectorize() for how they relate to the nnet's components' parameters). - void GetNnetParameters(const Vector &normalized_weights, + void GetNnetParameters(const Vector &normalized_weights, VectorBase *nnet_params) const; // This function computes the objective function (and its derivative, if the objective @@ -197,23 +220,23 @@ class NnetCombiner { // Given an objective-function derivative with respect to the nnet parameters, // computes the derivative with respect to the (normalized) weights. void GetWeightsDeriv(const VectorBase &nnet_params_deriv, - VectorBase *normalized_weights_deriv); + VectorBase *normalized_weights_deriv); // Computes the derivative w.r.t. the unnormalized weights, by propagating // through the normalization operation. // If config_.enforce_sum_to_one == false, just copies norm_weights_deriv to // unnorm_weights_deriv. 
- void GetUnnormalizedWeightsDeriv(const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv); + void GetUnnormalizedWeightsDeriv(const VectorBase &unnorm_weights, + const VectorBase &norm_weights_deriv, + VectorBase *unnorm_weights_deriv); // Given a derivative w.r.t. the weights, outputs a derivative w.r.t. // the params - void GetParamsDeriv(const VectorBase &weights, - const VectorBase &weight_deriv, - VectorBase *param_deriv); + void GetParamsDeriv(const VectorBase &weights, + const VectorBase &weight_deriv, + VectorBase *param_deriv); void ComputeUpdatableComponentDims(); void FinishPreprocessingInput(); From e7441330812957e2e3b563523aa5f7ab4b41176a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 22 Jan 2017 23:33:12 -0500 Subject: [PATCH 121/213] [src] Add more diagnostic output to lattice determinization programs --- src/lat/minimize-lattice.h | 1 + src/latbin/lattice-depth.cc | 6 +- src/latbin/lattice-determinize-non-compact.cc | 80 +++++++++++-------- .../lattice-determinize-phone-pruned.cc | 27 +++++-- src/latbin/lattice-determinize-pruned.cc | 18 +++++ src/latbin/lattice-determinize.cc | 36 ++++++--- 6 files changed, 119 insertions(+), 49 deletions(-) diff --git a/src/lat/minimize-lattice.h b/src/lat/minimize-lattice.h index 60acfb27353..fcf6c0f36df 100644 --- a/src/lat/minimize-lattice.h +++ b/src/lat/minimize-lattice.h @@ -40,6 +40,7 @@ namespace fst { /// function will not combine as many states as it could, but it won't crash. /// Returns true on success, and false if it failed due to topological sorting /// failing. +/// The output will be topologically sorted. template bool MinimizeCompactLattice( MutableFst > > *clat, diff --git a/src/latbin/lattice-depth.cc b/src/latbin/lattice-depth.cc index 93dfd5c966b..9a785c4b6a6 100644 --- a/src/latbin/lattice-depth.cc +++ b/src/latbin/lattice-depth.cc @@ -34,7 +34,7 @@ int main(int argc, char *argv[]) { using fst::VectorFst; using fst::StdArc; typedef StdArc::StateId StateId; - + const char *usage = "Compute the lattice depths in terms of the average number of arcs that\n" "cross a frame. See also lattice-depth-per-frame\n" @@ -42,7 +42,7 @@ int main(int argc, char *argv[]) { "E.g.: lattice-depth ark:- ark,t:-\n"; ParseOptions po(usage); - + po.Read(argc, argv); if (po.NumArgs() < 1 || po.NumArgs() > 2) { @@ -63,7 +63,7 @@ int main(int argc, char *argv[]) { std::string key = clat_reader.Key(); TopSortCompactLatticeIfNeeded(&clat); - + int32 t; BaseFloat depth = CompactLatticeDepth(clat, &t); diff --git a/src/latbin/lattice-determinize-non-compact.cc b/src/latbin/lattice-determinize-non-compact.cc index f79262e0832..44ae8566f86 100644 --- a/src/latbin/lattice-determinize-non-compact.cc +++ b/src/latbin/lattice-determinize-non-compact.cc @@ -56,9 +56,9 @@ bool DeterminizeLatticeWrapper(const Lattice &lat, KALDI_WARN << "Detected empty lattice, skipping " << key; return false; } - - // The work gets done in the next line. - if (DeterminizeLattice(lat, clat, lat_opts, NULL)) { + + // The work gets done in the next line. + if (DeterminizeLattice(lat, clat, lat_opts, NULL)) { if (prune) PruneLattice(cur_beam, clat); return true; } else { // failed to determinize.. 
@@ -91,14 +91,14 @@ bool DeterminizeLatticeWrapper(const Lattice &lat, } void ComputeAcousticScoresMap( - const Lattice &lat, - unordered_map, std::pair, + const Lattice &lat, + unordered_map, std::pair, PairHasher > *acoustic_scores) { acoustic_scores->clear(); std::vector state_times; LatticeStateTimes(lat, &state_times); - + KALDI_ASSERT(lat.Start() == 0); for (StateId s = 0; s < lat.NumStates(); s++) { @@ -111,17 +111,17 @@ void ComputeAcousticScoresMap( int32 tid = arc.ilabel; if (tid != 0) { - unordered_map, std::pair, + unordered_map, std::pair, PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); if (it == acoustic_scores->end()) { - acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), + acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), std::make_pair(weight.Value2(), 1))); } else { - if (it->second.second == 2 + if (it->second.second == 2 && it->second.first / it->second.second != weight.Value2()) { KALDI_VLOG(2) << "Transitions on the same frame have different " - << "acoustic costs for tid " << tid << "; " - << it->second.first / it->second.second + << "acoustic costs for tid " << tid << "; " + << it->second.first / it->second.second << " vs " << weight.Value2(); } it->second.first += weight.Value2(); @@ -135,7 +135,7 @@ void ComputeAcousticScoresMap( LatticeWeight f = lat.Final(s); if (f != LatticeWeight::Zero()) { - // Final acoustic cost must be 0 as we are reading from + // Final acoustic cost must be 0 as we are reading from // non-determinized, non-compact lattice KALDI_ASSERT(f.Value2() == 0.0); } @@ -143,25 +143,25 @@ void ComputeAcousticScoresMap( } void ReplaceAcousticScoresFromMap( - const unordered_map, std::pair, + const unordered_map, std::pair, PairHasher > &acoustic_scores, Lattice *lat) { fst::TopSort(lat); - + std::vector state_times; LatticeStateTimes(*lat, &state_times); - + KALDI_ASSERT(lat->Start() == 0); for (StateId s = 0; s < lat->NumStates(); s++) { int32 t = state_times[s]; - for (fst::MutableArcIterator aiter(lat, s); + for (fst::MutableArcIterator aiter(lat, s); !aiter.Done(); aiter.Next()) { Arc arc(aiter.Value()); - + int32 tid = arc.ilabel; if (tid != 0) { - unordered_map, std::pair, + unordered_map, std::pair, PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); if (it == acoustic_scores.end()) { KALDI_ERR << "Could not find tid " << tid << " at time " << t @@ -207,7 +207,7 @@ int main(int argc, char *argv[]) { "\n" "Usage: lattice-determinize-non-compact [options] lattice-rspecifier lattice-wspecifier\n" " e.g.: lattice-determinize-non-compact --acoustic-scale=0.1 --beam=15.0 ark:1.lats ark:det.lats\n"; - + ParseOptions po(usage); BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; @@ -218,7 +218,7 @@ int main(int argc, char *argv[]) { BaseFloat delta = fst::kDelta; bool prune = false; bool minimize = false; - + po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("beam", &beam, @@ -238,7 +238,7 @@ int main(int argc, char *argv[]) { "decrease beam by beam-ratio if determinization fails."); po.Register("minimize", &minimize, "If true, push and minimize after determinization"); - + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -252,12 +252,16 @@ int main(int argc, char *argv[]) { // Read as regular lattice-- this is the form we need it in for efficient // pruning. SequentialLatticeReader lattice_reader(lats_rspecifier); - + // Write as regular lattice. 
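The two helper functions above implement a simple sum-and-count scheme: before determinization they record, for every (frame, transition-id) pair, the total acoustic cost and the number of arcs that carried it; afterwards each surviving arc gets back the average. A toy Python sketch of that bookkeeping (the arc tuples are made-up values; the real code walks lattice arcs via LatticeStateTimes rather than a flat list):

# each arc: (frame t, transition-id tid, acoustic_cost)
arcs = [(0, 101, -1.2), (0, 101, -1.4), (1, 205, -0.7)]

acoustic_scores = {}                    # (t, tid) -> [sum_of_costs, count]
for t, tid, cost in arcs:
    entry = acoustic_scores.setdefault((t, tid), [0.0, 0])
    entry[0] += cost
    entry[1] += 1

# after determinization, put the per-(t, tid) average back on each arc:
restored = [(t, tid, acoustic_scores[(t, tid)][0] / acoustic_scores[(t, tid)][1])
            for t, tid, _ in arcs]
print(restored)   # both (0, 101) arcs get the average cost -1.3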
- LatticeWriter lattice_writer(lats_wspecifier); + LatticeWriter lattice_writer(lats_wspecifier); int32 n_done = 0, n_error = 0; + // depth stats (for diagnostics). + double sum_depth_in = 0.0, + sum_depth_out = 0.0, sum_t = 0.0; + if (acoustic_scale == 0.0) KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; LatticeWeight beam_weight(beam, static_cast(0.0)); @@ -265,21 +269,21 @@ int main(int argc, char *argv[]) { for (; !lattice_reader.Done(); lattice_reader.Next()) { std::string key = lattice_reader.Key(); Lattice lat = lattice_reader.Value(); - + lattice_reader.FreeCurrent(); - + fst::TopSort(&lat); - + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); - // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) - unordered_map, std::pair, + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, PairHasher > acoustic_scores; ComputeAcousticScoresMap(lat, &acoustic_scores); - + Invert(&lat); // make it so word labels are on the input. - + CompactLattice clat; if (DeterminizeLatticeWrapper(lat, key, prune, beam, beam_ratio, max_mem, max_loop, @@ -290,6 +294,13 @@ int main(int argc, char *argv[]) { MinimizeCompactLattice(&clat); } + int32 t; + TopSortCompactLatticeIfNeeded(&clat); + double depth = CompactLatticeDepth(clat, &t); + sum_depth_in += lat.NumStates(); + sum_depth_out += depth * t; + sum_t += t; + Lattice out_lat; fst::ConvertLattice(clat, &out_lat); fst::TopSort(&out_lat); @@ -298,7 +309,7 @@ int main(int argc, char *argv[]) { // the computed map ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); - fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &out_lat); lattice_writer.Write(key, out_lat); n_done++; @@ -307,6 +318,12 @@ int main(int argc, char *argv[]) { } } + if (sum_t != 0.0) { + KALDI_LOG << "Average input-lattice depth (measured at at state level) is " + << (sum_depth_in / sum_t) << ", output depth is " + << (sum_depth_out / sum_t) << ", over " << sum_t << "frames " + << " (average num-frames = " << (sum_t / n_done) << ")."; + } KALDI_LOG << "Done " << n_done << " lattices, errors on " << n_error; return (n_done != 0 ? 0 : 1); } catch(const std::exception &e) { @@ -314,4 +331,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/latbin/lattice-determinize-phone-pruned.cc b/src/latbin/lattice-determinize-phone-pruned.cc index 8df4bda1e1a..0959bcbcd74 100644 --- a/src/latbin/lattice-determinize-phone-pruned.cc +++ b/src/latbin/lattice-determinize-phone-pruned.cc @@ -28,7 +28,7 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; typedef kaldi::int32 int32; - + const char *usage = "Determinize lattices, keeping only the best path (sequence of\n" "acoustic states) for each input-symbol sequence. This version does\n" @@ -41,13 +41,13 @@ int main(int argc, char *argv[]) { " \n" " e.g.: lattice-determinize-phone-pruned --acoustic-scale=0.1 \\\n" " final.mdl ark:in.lats ark:det.lats\n"; - + ParseOptions po(usage); BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; fst::DeterminizeLatticePhonePrunedOptions opts; opts.max_mem = 50000000; - + po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic" " likelihoods."); po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); @@ -69,12 +69,16 @@ int main(int argc, char *argv[]) { // Reads as regular lattice-- this is the form the determinization code // accepts. 
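The depth statistics being added to these determinization programs are just frame-weighted averages accumulated over the whole archive: the input "depth" is approximated by states per frame, and the output depth is CompactLatticeDepth() (average arcs crossing a frame) weighted by each lattice's length. A short Python illustration with made-up per-lattice numbers:

# per lattice: (input_num_states, output_depth, num_frames) -- example values
lattices = [(900, 12.5, 300), (400, 8.0, 150)]

sum_depth_in = sum(s for s, _, _ in lattices)
sum_depth_out = sum(d * t for _, d, t in lattices)
sum_t = sum(t for _, _, t in lattices)
n_done = len(lattices)

print("average input depth :", sum_depth_in / sum_t)    # states per frame
print("average output depth:", sum_depth_out / sum_t)   # arcs crossing a frame
print("average num-frames  :", sum_t / n_done)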
SequentialLatticeReader lat_reader(lats_rspecifier); - + // Writes as compact lattice. - CompactLatticeWriter compact_lat_writer(lats_wspecifier); + CompactLatticeWriter compact_lat_writer(lats_wspecifier); int32 n_done = 0, n_warn = 0; + // depth stats (for diagnostics). + double sum_depth_in = 0.0, + sum_depth_out = 0.0, sum_t = 0.0; + if (acoustic_scale == 0.0) KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; @@ -95,11 +99,24 @@ int main(int argc, char *argv[]) { n_warn++; } + int32 t; + TopSortCompactLatticeIfNeeded(&det_clat); + double depth = CompactLatticeDepth(det_clat, &t); + sum_depth_in += lat.NumStates(); + sum_depth_out += depth * t; + sum_t += t; + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); compact_lat_writer.Write(key, det_clat); n_done++; } + if (sum_t != 0.0) { + KALDI_LOG << "Average input-lattice depth (measured at at state level) is " + << (sum_depth_in / sum_t) << ", output depth is " + << (sum_depth_out / sum_t) << ", over " << sum_t << " frames " + << " (average num-frames = " << (sum_t / n_done) << ")."; + } KALDI_LOG << "Done " << n_done << " lattices, determinization finished " << "earlier than specified by the beam on " << n_warn << " of " << "these."; diff --git a/src/latbin/lattice-determinize-pruned.cc b/src/latbin/lattice-determinize-pruned.cc index 1e6fa2d6de2..3e8bca5a3ce 100644 --- a/src/latbin/lattice-determinize-pruned.cc +++ b/src/latbin/lattice-determinize-pruned.cc @@ -74,6 +74,10 @@ int main(int argc, char *argv[]) { int32 n_done = 0, n_warn = 0; + // depth stats (for diagnostics). + double sum_depth_in = 0.0, + sum_depth_out = 0.0, sum_t = 0.0; + if (acoustic_scale == 0.0) KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; @@ -109,11 +113,25 @@ int main(int argc, char *argv[]) { PushCompactLatticeWeights(&det_clat); MinimizeCompactLattice(&det_clat); } + + int32 t; + TopSortCompactLatticeIfNeeded(&det_clat); + double depth = CompactLatticeDepth(det_clat, &t); + sum_depth_in += lat.NumStates(); + sum_depth_out += depth * t; + sum_t += t; + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); compact_lat_writer.Write(key, det_clat); n_done++; } + if (sum_t != 0.0) { + KALDI_LOG << "Average input-lattice depth (measured at at state level) is " + << (sum_depth_in / sum_t) << ", output depth is " + << (sum_depth_out / sum_t) << ", over " << sum_t << " frames " + << " (average num-frames = " << (sum_t / n_done) << ")."; + } KALDI_LOG << "Done " << n_done << " lattices, determinization finished " << "earlier than specified by the beam (or output was empty) on " << n_warn << " of these."; diff --git a/src/latbin/lattice-determinize.cc b/src/latbin/lattice-determinize.cc index 8a5bd93e503..d59fcda7022 100644 --- a/src/latbin/lattice-determinize.cc +++ b/src/latbin/lattice-determinize.cc @@ -50,9 +50,9 @@ bool DeterminizeLatticeWrapper(const Lattice &lat, KALDI_WARN << "Detected empty lattice, skipping " << key; return false; } - - // The work gets done in the next line. - if (DeterminizeLattice(lat, clat, lat_opts, NULL)) { + + // The work gets done in the next line. + if (DeterminizeLattice(lat, clat, lat_opts, NULL)) { if (prune) PruneLattice(cur_beam, clat); return true; } else { // failed to determinize.. 
@@ -104,7 +104,7 @@ int main(int argc, char *argv[]) { "\n" "Usage: lattice-determinize [options] lattice-rspecifier lattice-wspecifier\n" " e.g.: lattice-determinize --acoustic-scale=0.1 --beam=15.0 ark:1.lats ark:det.lats\n"; - + ParseOptions po(usage); BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; @@ -115,7 +115,7 @@ int main(int argc, char *argv[]) { BaseFloat delta = fst::kDelta; bool prune = false; bool minimize = false; - + po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("beam", &beam, @@ -135,7 +135,7 @@ int main(int argc, char *argv[]) { "decrease beam by beam-ratio if determinization fails."); po.Register("minimize", &minimize, "If true, push and minimize after determinization"); - + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -150,12 +150,16 @@ int main(int argc, char *argv[]) { // Read as regular lattice-- this is the form we need it in for efficient // pruning. SequentialLatticeReader lattice_reader(lats_rspecifier); - + // Write as compact lattice. - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + CompactLatticeWriter compact_lattice_writer(lats_wspecifier); int32 n_done = 0, n_error = 0; + // depth stats (for diagnostics). + double sum_depth_in = 0.0, + sum_depth_out = 0.0, sum_t = 0.0; + if (acoustic_scale == 0.0) KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; LatticeWeight beam_weight(beam, static_cast(0.0)); @@ -164,7 +168,7 @@ int main(int argc, char *argv[]) { std::string key = lattice_reader.Key(); Lattice lat = lattice_reader.Value(); Invert(&lat); // make it so word labels are on the input. - + lattice_reader.FreeCurrent(); fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); @@ -177,6 +181,14 @@ int main(int argc, char *argv[]) { PushCompactLatticeWeights(&clat); MinimizeCompactLattice(&clat); } + + int32 t; + TopSortCompactLatticeIfNeeded(&clat); + double depth = CompactLatticeDepth(clat, &t); + sum_depth_in += lat.NumStates(); + sum_depth_out += depth * t; + sum_t += t; + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &clat); compact_lattice_writer.Write(key, clat); n_done++; @@ -185,6 +197,12 @@ int main(int argc, char *argv[]) { } } + if (sum_t != 0.0) { + KALDI_LOG << "Average input-lattice depth (measured at at state level) is " + << (sum_depth_in / sum_t) << ", output depth is " + << (sum_depth_out / sum_t) << ", over " << sum_t << " frames " + << " (average num-frames = " << (sum_t / n_done) << ")."; + } KALDI_LOG << "Done " << n_done << " lattices, errors on " << n_error; return (n_done != 0 ? 
0 : 1); } catch(const std::exception &e) { From 36bbf4479ef6b78d0101549b1dc9fb73082692c1 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 22 Jan 2017 23:37:31 -0500 Subject: [PATCH 122/213] [src] Extend nnet3 Nnet reading code to accept .mdl files --- src/nnet3/nnet-discriminative-training.cc | 13 +++++++++++++ src/nnet3/nnet-nnet.cc | 22 ++++++++++++++++++++++ src/nnet3/nnet-nnet.h | 2 ++ src/nnet3/nnet-utils.h | 4 +++- 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/nnet3/nnet-discriminative-training.cc b/src/nnet3/nnet-discriminative-training.cc index 5ef1675c5ca..fb4b7db8c3c 100644 --- a/src/nnet3/nnet-discriminative-training.cc +++ b/src/nnet3/nnet-discriminative-training.cc @@ -238,6 +238,19 @@ void DiscriminativeObjectiveFunctionInfo::PrintStatsForThisPhase( bool DiscriminativeObjectiveFunctionInfo::PrintTotalStats(const std::string &name, const std::string &criterion) const { BaseFloat objf = stats.TotalObjf(criterion) /stats.tot_t_weighted; + + double avg_gradients = (stats.tot_num_count + stats.tot_den_count) / + stats.tot_t_weighted; + KALDI_LOG << "Average num+den count of stats is " << avg_gradients + << " per frame, over " + << stats.tot_t_weighted << " frames."; + if (stats.tot_l2_term != 0.0) { + KALDI_LOG << "Average l2 norm of output per frame is " + << (stats.tot_l2_term / stats.tot_t_weighted) << " over " + << stats.tot_t_weighted << " frames."; + } + + KALDI_LOG << "Overall average objective function for '" << name << "' is " << objf << " over " << stats.tot_t_weighted << " frames."; KALDI_LOG << "[this line is to be parsed by a script:] " diff --git a/src/nnet3/nnet-nnet.cc b/src/nnet3/nnet-nnet.cc index ad5f715a294..dd90af739e7 100644 --- a/src/nnet3/nnet-nnet.cc +++ b/src/nnet3/nnet-nnet.cc @@ -23,6 +23,8 @@ #include "nnet3/nnet-parse.h" #include "nnet3/nnet-utils.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/am-nnet-simple.h" +#include "hmm/transition-model.h" namespace kaldi { namespace nnet3 { @@ -565,8 +567,28 @@ void Nnet::GetSomeNodeNames( } } +void Nnet::Swap(Nnet *other) { + component_names_.swap(other->component_names_); + components_.swap(other->components_); + node_names_.swap(other->node_names_); + nodes_.swap(other->nodes_); +} + void Nnet::Read(std::istream &is, bool binary) { Destroy(); + int first_char = PeekToken(is, binary); + if (first_char == 'T') { + // This branch is to allow '.mdl' files (containing a TransitionModel + // and then an AmNnetSimple) to be read where .raw files (containing + // just an Nnet) would be expected. This is often convenient. 
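The new branch in Nnet::Read() uses the usual peek-and-dispatch trick: look at the start of the stream without consuming it, and if it announces a TransitionModel treat the input as an .mdl file (TransitionModel followed by AmNnetSimple), otherwise read a raw Nnet as before. A rough Python analogue of that dispatch (text-mode only, with simplified token handling; the real code peeks a single token via PeekToken and handles binary mode too):

import io

def read_nnet_like(stream):
    pos = stream.tell()
    head = stream.read(len("<TransitionModel>"))
    stream.seek(pos)                    # leave the stream untouched
    if head.startswith("<TransitionModel>"):
        return "mdl file: read TransitionModel, then the Nnet inside AmNnetSimple"
    return "raw file: read the Nnet directly"

print(read_nnet_like(io.StringIO("<TransitionModel> ...")))
print(read_nnet_like(io.StringIO("<Nnet3> ...")))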
+ TransitionModel temp_trans_model; + temp_trans_model.Read(is, binary); + AmNnetSimple temp_am_nnet; + temp_am_nnet.Read(is, binary); + temp_am_nnet.GetNnet().Swap(this); + return; + } + ExpectToken(is, binary, ""); std::ostringstream config_file_out; std::string cur_line; diff --git a/src/nnet3/nnet-nnet.h b/src/nnet3/nnet-nnet.h index 0e6918de18d..5eb87fd30f3 100644 --- a/src/nnet3/nnet-nnet.h +++ b/src/nnet3/nnet-nnet.h @@ -233,6 +233,8 @@ class Nnet { Nnet *Copy() const { return new Nnet(*this); } + void Swap(Nnet *other); + // Assignment operator Nnet& operator =(const Nnet &nnet); diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 95c28caf746..3bda01271d2 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -179,7 +179,9 @@ void FindOrphanNodes(const Nnet &nnet, std::vector *nodes); ReadEditConfig() reads a file with a similar-looking format to the config file read by Nnet::ReadConfig(), but this consists of a sequence of operations to perform on an existing network, mostly modifying components. It's one - "directive" (i.e. command) per line. + "directive" (i.e. command) per line, but if supplying the options via + the --edits option to programs like nnet3-am-copy, you can use a semicolon + in place of the newline to separate commands. The following describes the allowed commands. Note: all patterns are like UNIX globbing patterns where the only metacharacter is '*', representing zero From 6919a5a18f2305110b5033d6f1d06ce7077fcfe7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 23 Jan 2017 14:09:52 -0500 Subject: [PATCH 123/213] [src][egs] Various script updates/clarifications, remove no-op options; remove now-removed options from some discriminative-training egs scripts; various bug fixes/tuning. --- .../s5/local/nnet3/run_tdnn_discriminative.sh | 7 -- .../s5/local/nnet3/run_tdnn_discriminative.sh | 6 -- .../s5c/local/nnet3/compare_wer_general.sh | 0 .../local/nnet3/run_blstm_discriminative.sh | 6 -- .../local/nnet3/run_tdnn_discriminative.sh | 6 -- egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh | 16 +++-- .../s5c/local/nnet3/tuning/run_tdnn_d_disc.sh | 33 ++++----- .../s5/local/nnet3/run_tdnn_discriminative.sh | 6 -- .../nnet3/tuning/run_tdnn_lstm_1b_disc.sh | 29 +++++--- .../s5/local/nnet3/run_lstm_discriminative.sh | 5 -- .../s5/local/nnet3/run_tdnn_discriminative.sh | 5 -- egs/wsj/s5/steps/nnet2/train_tanh.sh | 24 +++---- egs/wsj/s5/steps/nnet3/align.sh | 11 ++- egs/wsj/s5/steps/nnet3/get_degs.sh | 63 ++++++++++------- .../s5/steps/nnet3/report/generate_plots.py | 67 ++++++++----------- .../s5/steps/nnet3/train_discriminative.sh | 12 +++- egs/wsj/s5/steps/shift_feats.sh | 8 +-- egs/wsj/s5/utils/data/limit_feature_dim.sh | 2 +- 18 files changed, 150 insertions(+), 156 deletions(-) mode change 100644 => 100755 egs/swbd/s5c/local/nnet3/compare_wer_general.sh diff --git a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh index 7dc82ad34d1..324061aa5ac 100644 --- a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh @@ -46,10 +46,6 @@ num_jobs_nnet=4 num_epochs=2 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false - # because it does not help in some setups -last_layer_factor=0.1 - ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. 
@@ -126,7 +122,6 @@ if [ -z "$degs_dir" ]; then steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ - --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ $frame_subsampling_opt \ @@ -143,8 +138,6 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --adjust-priors $adjust_priors \ - --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh index 365d01cc85d..b513e0908a5 100755 --- a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh @@ -53,9 +53,6 @@ num_jobs_nnet=4 num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false - # because it does not help in some setups -last_layer_factor=0.1 ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. @@ -133,7 +130,6 @@ if [ -z "$degs_dir" ]; then steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ - --adjust-priors $adjust_priors \ --online-ivector-dir $train_ivector_dir \ --left-context $left_context --right-context $right_context \ $frame_subsampling_opt \ @@ -150,8 +146,6 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --adjust-priors $adjust_priors \ - --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/swbd/s5c/local/nnet3/compare_wer_general.sh b/egs/swbd/s5c/local/nnet3/compare_wer_general.sh old mode 100644 new mode 100755 diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh index 349fd246022..c6dfb0107cd 100755 --- a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -52,9 +52,6 @@ num_jobs_nnet=4 num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false - # because it does not help in some setups -last_layer_factor=0.1 ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. 
@@ -138,7 +135,6 @@ if [ -z "$degs_dir" ]; then steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ - --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ $frame_subsampling_opt \ @@ -155,8 +151,6 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --adjust-priors $adjust_priors \ - --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh index ceef60d0656..7af311e7ff4 100755 --- a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh @@ -46,9 +46,6 @@ num_jobs_nnet=4 num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false - # because it does not help in some setups -last_layer_factor=0.1 ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. @@ -126,7 +123,6 @@ if [ -z "$degs_dir" ]; then steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ - --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ $frame_subsampling_opt \ @@ -143,8 +139,6 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --adjust-priors $adjust_priors \ - --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh index ec80972cf2d..b4b60688cdb 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh @@ -6,13 +6,15 @@ # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, # --num-threads 16 and --minibatch-size 128. -# System tdnn_c tdnn_d -# WER on train_dev(tg) 17.37 16.72 -# WER on train_dev(fg) 15.94 15.31 -# WER on eval2000(tg) 20.0 19.2 -# WER on eval2000(fg) 18.2 17.8 -# Final train prob -1.43781 -1.22859 -# Final valid prob -1.56895 -1.354 +# note: the last column is a version of tdnn_d that was done after the +# changes for the 5.1 version of Kaldi (variable minibatch-sizes, etc.) 
+# System tdnn_c tdnn_d tdnn_d[repeat] +# WER on train_dev(tg) 17.37 16.72 16.51 +# WER on train_dev(fg) 15.94 15.31 15.34 +# WER on eval2000(tg) 20.0 19.2 19.2 +# WER on eval2000(fg) 18.2 17.8 17.7 +# Final train prob -1.43781 -1.22859 -1.22215 +# Final valid prob -1.56895 -1.354 -1.31647 stage=0 affix= diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh index da7cae954f8..22f4004c056 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d_disc.sh @@ -29,7 +29,7 @@ nj=400 # have a high number of jobs because this could take a while, and we migh graph_dir=exp/tri4/graph_sw1_tg srcdir=exp/nnet3/tdnn_d_sp train_data_dir=data/train_nodup_sp_hires -online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp ## Objective options @@ -37,7 +37,12 @@ criterion=smbr one_silence_class=true # you can set --disc-affix if you run different configurations, e.g. --disc-affix "_b" -disc_affix= +# originally ran with no affix, with effective_learning_rate=0.0000125; +# reran by mistake with no affix with effective_learning_rate=0.000005 [was a bit +# better, see NOTES, but still best after 1st epoch]. +# reran again with affix=slow and effective_learning_rate=0.0000025 + +disc_affix=slow dir=${srcdir}_${criterion}${disc_affix} @@ -57,7 +62,7 @@ extra_right_context=0 ## Nnet training options -effective_learning_rate=0.0000125 +effective_learning_rate=0.0000025 max_param_change=1 num_jobs_nnet=4 num_epochs=3 @@ -66,8 +71,6 @@ regularization_opts= # Applicable for providing --xent-regularize and - minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). -last_layer_factor=0.1 # prevent the final layer from learning too fast; - # this can be a problem. ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. 
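The variable minibatch-size string used by this script, "300=32,16/150=64,32", encodes the rule spelled out in the comment above: pick the entry whose nominal chunk size is closest to the chunk size of the egs, and use the second number for the final mop-up minibatch. A small Python sketch of that rule (an illustration of the convention only, not Kaldi's actual parser, which lives in the nnet3 example utilities):

def pick_minibatch_size(rule, chunk_size, mopup=False):
    options = []
    for part in rule.split('/'):
        nominal, values = part.split('=')
        regular, small = values.split(',')
        options.append((int(nominal), int(regular), int(small)))
    # choose the entry whose nominal chunk size is closest to ours
    _, regular, small = min(options, key=lambda o: abs(o[0] - chunk_size))
    return small if mopup else regular

print(pick_minibatch_size("300=32,16/150=64,32", 280))               # -> 32
print(pick_minibatch_size("300=32,16/150=64,32", 150, mopup=True))   # -> 32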
@@ -136,7 +139,6 @@ if [ $stage -le 3 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size "$minibatch_size" \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi @@ -145,15 +147,16 @@ if [ $stage -le 4 ]; then for decode_set in train_dev eval2000; do num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` for iter in epoch$x epoch${x}_adj; do - - steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ - --online-ivector-dir exp/nnet3/ivectors_${decode_set}_hires \ - $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg_${iter} || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ - $dir/decode_${decode_set}_${iter}_sw1_{tg,fsh_fg} || exit 1; - ) & + ( + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg_${iter} || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_${iter} || exit 1; + ) & + done done done fi diff --git a/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh index 805d38b4e88..8d7393af853 100755 --- a/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh @@ -60,9 +60,6 @@ num_jobs_nnet=4 num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false - # because it does not help in some setups -last_layer_factor=0.1 ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. @@ -137,7 +134,6 @@ if [ -z "$degs_dir" ]; then steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ - --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ $frame_subsampling_opt \ @@ -154,8 +150,6 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --adjust-priors $adjust_priors \ - --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh index 9a77a6af6c7..07c3d4af233 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh @@ -6,8 +6,8 @@ # to use the non-cleaned data. # # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, -# since the lattice generation runs in about real-time, so takes of the order of -# 1000 hours of CPU time. +# since the alignment and the lattice generation/egs-dumping takes quite a bit +# of CPU time. set -e @@ -37,7 +37,11 @@ criterion=smbr one_silence_class=true # you can set --disc-affix if you run different configurations, e.g. 
--disc-affix "_b" -disc_affix=slow +# note, I ran without affix with learning rate 0.0000125, with disc_affic=slow +# with learning rate 0.000005, and with disc_affix=slow2 with learning rate 0.0000025. +# disc_affix=slow3 is with effective_learning_rate=0.000005 and last_layer_factor=0.1 + +disc_affix=slow3 dir=${srcdir}_${criterion}${disc_affix} @@ -45,11 +49,17 @@ dir=${srcdir}_${criterion}${disc_affix} ## so it can split utterances without much gap or overlap. frames_per_eg=300,280,150,120,100 frames_overlap_per_eg=0 -frames_per_chunk_decoding=200 +frames_per_chunk_egs=200 # for alignments and denlat creation. +frames_per_chunk_decoding=50 # for decoding; should be the same as the value + # used in the script that trained the nnet. + # We didn't set the frames_per_chunk in + # run_tdnn_lstm_1b.sh, so it defaults to 50. ## these context options should match the training condition. (chunk_left_context, ## chunk_right_context) ## We set --extra-left-context-initial 0 and --extra-right-context-final 0 ## directly in the script below, but this should also match the training condition. +## note: --extra-left-context should be the same as the chunk_left_context (or in +## general, the argument of --egs.chunk-left-context) in the baseline script. extra_left_context=40 extra_right_context=0 @@ -57,6 +67,7 @@ extra_right_context=0 ## Nnet training options effective_learning_rate=0.000005 +last_layer_factor=0.1 max_param_change=1 num_jobs_nnet=4 num_epochs=2 @@ -65,8 +76,6 @@ regularization_opts= # Applicable for providing --xent-regularize and - minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). -last_layer_factor=0.1 # prevent the final layer from learning too fast; - # this can be a problem. ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. @@ -95,7 +104,7 @@ if [ $stage -le 1 ]; then # hardcode no-GPU for alignment, although you could use GPU [you wouldn't # get excellent GPU utilization though.] 
steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ - --frames-per-chunk $frames_per_chunk_decoding \ + --frames-per-chunk $frames_per_chunk_egs \ --extra-left-context $extra_left_context --extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --online-ivector-dir $online_ivector_dir \ @@ -118,7 +127,7 @@ if [ -z "$degs_dir" ]; then --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --frames-per-chunk-decoding "$frames_per_chunk_decoding" \ + --frames-per-chunk-decoding "$frames_per_chunk_egs" \ --stage $get_egs_stage \ --online-ivector-dir $online_ivector_dir \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg \ @@ -131,11 +140,11 @@ if [ $stage -le 3 ]; then steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ --stage $train_stage \ --effective-lrate $effective_learning_rate --max-param-change $max_param_change \ + --last-layer-factor $last_layer_factor \ --criterion $criterion --drop-frames true \ --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size "$minibatch_size" \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi @@ -149,6 +158,7 @@ if [ $stage -le 4 ]; then --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_decoding" \ --online-ivector-dir exp/nnet3_cleaned/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${iter} || exit 1; @@ -172,5 +182,4 @@ if [ $stage -le 5 ] && $cleanup; then steps/nnet2/remove_egs.sh ${srcdir}_degs || true fi - exit 0; diff --git a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh index 3fffd59426c..124b04949a0 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh @@ -54,8 +54,6 @@ num_jobs_nnet=4 num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true -last_layer_factor=0.1 ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. 
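The --effective-lrate and --last-layer-factor options used by these discriminative scripts combine as follows: train_discriminative.sh multiplies the effective rate by the number of parallel jobs (the per-job models are averaged), then scales the output layer's rate down by last-layer-factor so the final layer does not learn too fast. The arithmetic, with the values used above (illustration only; the exact formatting of the edit string may differ):

num_jobs_nnet = 4
effective_learning_rate = 0.000005
last_layer_factor = 0.1

learning_rate = num_jobs_nnet * effective_learning_rate          # 2e-05
last_layer_learning_rate = learning_rate * last_layer_factor     # 2e-06

# roughly the edit string that ends up being passed to nnet3-am-copy --edits=...
edits_str = "set-learning-rate learning-rate=%g" % learning_rate
if last_layer_factor != 1.0:
    edits_str += ("; set-learning-rate name=output.affine learning-rate=%g"
                  % last_layer_learning_rate)
print(edits_str)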
@@ -141,7 +139,6 @@ if [ -z "$degs_dir" ]; then steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ - --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ $frame_subsampling_opt \ @@ -158,8 +155,6 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --adjust-priors $adjust_priors \ - --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh index b84688f574c..01e1476befb 100755 --- a/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh @@ -46,8 +46,6 @@ num_jobs_nnet=4 num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true -last_layer_factor=0.1 ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. @@ -125,7 +123,6 @@ if [ -z "$degs_dir" ]; then steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ - --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ $frame_subsampling_opt \ @@ -142,8 +139,6 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --adjust-priors $adjust_priors \ - --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi diff --git a/egs/wsj/s5/steps/nnet2/train_tanh.sh b/egs/wsj/s5/steps/nnet2/train_tanh.sh index d4ec6412be9..7568da320ee 100755 --- a/egs/wsj/s5/steps/nnet2/train_tanh.sh +++ b/egs/wsj/s5/steps/nnet2/train_tanh.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. @@ -15,7 +15,7 @@ num_iters_final=20 # Maximum number of final iterations to give to the initial_learning_rate=0.04 final_learning_rate=0.004 bias_stddev=0.5 -shrink_interval=5 # shrink every $shrink_interval iters except while we are +shrink_interval=5 # shrink every $shrink_interval iters except while we are # still adding layers, when we do it every iter. shrink=true num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if @@ -66,7 +66,7 @@ egs_dir= lda_opts= egs_opts= transform_dir= -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type= # can be used to force "raw" feature type. prior_subset_size=10000 # 10k samples per job, for computing priors. Should be @@ -122,7 +122,7 @@ if [ $# != 4 ]; then echo " # interpolate parameters (the weights are learned with a validation set)" echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -139,7 +139,7 @@ done # Set some variables. 
num_leaves=`am-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1; - + nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... # in this dir we'll have just one job. sdata=$data/split$nj @@ -210,7 +210,7 @@ SoftmaxComponent dim=$num_leaves EOF # to hidden.config it will write the part of the config corresponding to a - # single hidden layer; we need this to add new layers. + # single hidden layer; we need this to add new layers. cat >$dir/hidden.config < $dir/foo 2>/dev/null || exit 1 nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'` - na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l` + na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l` # na is number of last updatable AffineComponent layer [one-based, counting only # updatable components.] # The last two layers will get this (usually lower) learning rate. lr_string="$learning_rate" - for n in `seq 2 $nu`; do + for n in `seq 2 $nu`; do if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$last_layer_learning_rate; else lr=$learning_rate; fi lr_string="$lr_string:$lr" done - + $cmd $dir/log/average.$x.log \ nnet-am-average $nnets_list - \| \ nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1; @@ -327,7 +327,7 @@ while [ $x -lt $num_iters ]; do else # On other iters, do nnet-am-fix which is much faster and has roughly # the same effect. - nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log + nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log fi if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then @@ -411,7 +411,7 @@ if $cleanup; then fi echo Removing most of the models for x in `seq 0 $num_iters`; do - if [ $[$x%100] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then + if [ $[$x%100] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then # delete all but every 10th model; don't delete the ones which combine to form the final model. rm $dir/$x.mdl fi diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh index fdf8130ec62..4c3b0987562 100755 --- a/egs/wsj/s5/steps/nnet3/align.sh +++ b/egs/wsj/s5/steps/nnet3/align.sh @@ -135,10 +135,19 @@ tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then # e.g. for 'chain' systems - frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" + frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" cp $srcdir/frame_subsampling_factor $dir + if [ "$frame_subsampling_factor" -gt 1 ] && \ + [ "$scale_opts" == "--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" ]; then + echo "$0: frame-subsampling-factor is not 1 (so likely a chain system)," + echo "... but the scale opts are the defaults. 
You probably want" + echo "--scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0'" + sleep 1 + fi fi + $cmd $queue_opt JOB=1:$nj $dir/log/align.JOB.log \ compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" ark:- \| \ nnet3-align-compiled $scale_opts $ivector_opts $frame_subsampling_opt \ diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 65704fe9894..f9737b4c8f4 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -49,9 +49,9 @@ num_utts_subset=80 # number of utterances in validation and training # subsets used for diagnostics. num_egs_subset=800 # number of egs (maximum) for the validation and training # subsets used for diagnostics. -frames_per_iter=400000 # each iteration of training, see this many frames - # per job. This is just a guideline; it will pick a number - # that divides the number of samples in the entire data. +frames_per_iter=1000000 # each iteration of training, see this many frames + # per job. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. cleanup=true stage=0 @@ -201,10 +201,20 @@ if [ -f $srcdir/frame_subsampling_factor ]; then # e.g. for 'chain' systems frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" cp $srcdir/frame_subsampling_factor $dir + if [ $frame_subsampling_factor -ne 1 ] && [ "$self_loop_scale" == "0.1" ]; then + echo "$0: warning: frame_subsampling_factor is not 1 (so likely a chain system)," + echo "... but self-loop-scale is 0.1. Make sure this is not a mistake." + sleep 1 + fi else frame_subsampling_factor=1 fi +if [ "$self_loop_scale" == "1.0" ] && [ "$acwt" == 0.1 ]; then + echo "$0: warning: you set --self-loop-scale=1.0 (so likely a chain system)", + echo " ... but the acwt is still 0.1 (you probably want --acwt 1.0)" + sleep 1 +fi ## Make the decoding graph. if [ $stage -le 0 ]; then @@ -270,6 +280,30 @@ cp $lang/phones/silence.csl $dir/info/ # of archives we assume that this will be the average number of frames per eg. frames_per_eg_principal=$(echo $frames_per_eg | cut -d, -f1) + +# read 'mof' as max_open_filehandles. +# When splitting up the scp files, we don't want to have to hold too many +# files open at once. If the number of archives we have to write exceeds +# 256 (or less if unlimit -n is smaller), we split in two stages. +mof=$(ulimit -n) || exit 1 +# the next step helps work around inconsistency between different machines on a +# cluster. It's unlikely that the allowed number of open filehandles would ever +# be less than 256. +if [ $mof -gt 256 ]; then mof=256; fi +# allocate mof minus 3 for the max allowed outputs, because of +# stdin,stderr,stdout. this will normally come to 253. We'll do a two-stage +# splitting if the needed number of scp files is larger than this. +num_groups=$[(num_archives+(mof-3)-1)/(mof-3)] +group_size=$[(num_archives+num_groups-1)/num_groups] +if [ $num_groups -gt 1 ]; then + new_num_archives=$[group_size*num_groups] + [ $new_num_archives -ne $num_archives ] && \ + echo "$0: rounding up num-archives from $num_archives to $new_num_archives for easier splitting" + num_archives=$new_num_archives + echo $new_num_archives >$dir/info/num_archives +fi + + if [ -e $dir/storage ]; then # Make soft links to storage directories, if distributing this way.. See # utils/create_split_dir.pl. 
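The open-filehandle logic above is just ceiling division: with at most mof - 3 scp outputs open at once (three filehandles are reserved for stdin, stdout and stderr), the number of groups and the group size follow from two round-ups, and num_archives is then rounded up so the groups divide it evenly. For example (pure arithmetic, example archive counts):

def split_plan(num_archives, mof=256):
    max_outputs = mof - 3                                         # usually 253
    num_groups = (num_archives + max_outputs - 1) // max_outputs  # ceil
    group_size = (num_archives + num_groups - 1) // num_groups    # ceil
    return num_groups, group_size, num_groups * group_size        # last = rounded-up num_archives

print(split_plan(253))   # (1, 253, 253): single-stage split
print(split_plan(300))   # (2, 150, 300): two-stage split
print(split_plan(505))   # (2, 253, 506): num_archives rounded up to 506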
@@ -336,7 +370,8 @@ if [ $stage -le 3 ]; then $dir/dengraph/HCLG.fst "$feats" ark:- \| \ $lattice_determinize_cmd \| \ nnet3-discriminative-get-egs --acoustic-scale=$acwt --compress=$compress \ - --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg \ + $frame_subsampling_opt --num-frames=$frames_per_eg \ + --num-frames-overlap=$frames_overlap_per_eg \ $ivector_opts $context_opts \ $dir/final.mdl "$feats" "ark,s,cs:-" \ "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \ @@ -390,26 +425,6 @@ if [ $stage -le 4 ]; then fi -# read 'mof' as max_open_filehandles. -# When splitting up the scp files, we don't want to have to hold too many -# files open at once. -mof=$(ulimit -n) || exit 1 -# the next step helps work around inconsistency between different machines on a -# cluster. It's unlikely that the allowed number of open filehandles would ever -# be less than 256. -if [ $mof -gt 256 ]; then mof=256; fi -# allocate mof minus 3 for the max allowed outputs, because of -# stdin,stderr,stdout. this will normally come to 253. We'll do a two-stage -# splitting if the needed number of scp files is larger than this. -num_groups=$[(num_archives+(mof-3)-1)/(mof-3)] -group_size=$[(num_archives+num_groups-1)/num_groups] -if [ $num_groups -gt 1 ]; then - new_num_archives=$[group_size*num_groups] - [ $new_num_archives -ne $num_archives ] && \ - echo "$0: rounding up num-archives from $num_archives to $new_num_archives for easier splitting" - echo $new_num_archives >$dir/info/num_archives -fi - # function/pseudo-command to randomly shuffle input lines using a small buffer size function shuffle { diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index bf9bcd1d45c..dddef38573e 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -48,40 +48,34 @@ def get_args(): parser = argparse.ArgumentParser( description="""Parses the training logs and generates a variety of plots. - e.g. (deprecated): steps/nnet3/report/generate_plots.py \\ + e.g.: steps/nnet3/report/generate_plots.py \\ --comparison-dir exp/nnet3/tdnn1 --comparison-dir exp/nnet3/tdnn2 \\ - exp/nnet3/tdnn exp/nnet3/tdnn/report - e.g. (current): steps/nnet3/report/generate_plots.py \\ - exp/nnet3/tdnn exp/nnet3/tdnn1 exp/nnet3/tdnn2 exp/nnet3/tdnn/report""") + exp/nnet3/tdnn exp/nnet3/tdnn/report""") parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison. " - "These will only be used for plots, not tables" - "Note: this option is deprecated.") + "These will only be used for plots, not tables") parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default=1) parser.add_argument("--is-chain", type=str, default=False, action=common_lib.StrToBoolAction, - help="Iteration from which plotting will start") + help="True if directory contains chain models") parser.add_argument("--output-nodes", type=str, default=None, action=common_lib.NullstrToNoneAction, help="""List of space separated : entities, one for each output node""") - parser.add_argument("exp_dir", nargs='+', - help="the first dir is the experiment directory, " - "e.g. exp/nnet3/tdnn, the rest dirs (if exist) " - "are other experiment directories for comparison.") + parser.add_argument("exp_dir", + help="experiment directory, e.g. exp/nnet3/tdnn") parser.add_argument("output_dir", help="experiment directory, " "e.g. 
exp/nnet3/tdnn/report") args = parser.parse_args() - if (args.comparison_dir is not None and len(args.comparison_dir) > 6) or \ - (args.exp_dir is not None and len(args.exp_dir) > 7): + if args.comparison_dir is not None and len(args.comparison_dir) > 6: raise Exception( - """max 6 comparison directories can be specified. + """max 6 --comparison-dir options can be specified. If you want to compare with more comparison_dir, you would have to carefully tune the plot_colors variable which specified colors used for plotting.""") @@ -156,10 +150,10 @@ def latex_compliant_name(name_string): return node_name_string -def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', - file_basename='accuracy', comparison_dir=None, - start_iter=1, latex_report=None, output_name='output'): - +def generate_accuracy_plots(exp_dir, output_dir, plot, key='accuracy', + file_basename='accuracy', comparison_dir=None, + start_iter=1, + latex_report=None, output_name='output'): assert start_iter >= 1 if plot: @@ -170,20 +164,22 @@ def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy', dirs = [exp_dir] + comparison_dir index = 0 for dir in dirs: - [report, times, data] = log_parse.generate_acc_logprob_report(dir, key, - output_name) + [accuracy_report, accuracy_times, + accuracy_data] = log_parse.generate_accuracy_report(dir, key, + output_name) if index == 0: # this is the main experiment directory with open("{0}/{1}.log".format(output_dir, file_basename), "w") as f: - f.write(report) + f.write(accuracy_report) if plot: color_val = g_plot_colors[index] - data = np.array(data) + data = np.array(accuracy_data) if data.shape[0] == 0: - raise Exception("Couldn't find any rows for the" - "accuracy/log-probability plot") + logger.warning("Couldn't find any rows for the accuracy plot, " + "not generating it."); + return data = data[data[:, 0] >= start_iter, :] plot_handle, = plt.plot(data[:, 0], data[:, 1], color=color_val, linestyle="--", @@ -594,28 +590,28 @@ def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None, for (output_name, objective_type) in output_names: if objective_type == "linear": logger.info("Generating accuracy plots") - generate_acc_logprob_plots( + generate_accuracy_plots( exp_dir, output_dir, g_plot, key='accuracy', file_basename='accuracy', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) logger.info("Generating log-likelihood plots") - generate_acc_logprob_plots( + generate_accuracy_plots( exp_dir, output_dir, g_plot, key='log-likelihood', file_basename='loglikelihood', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) elif objective_type == "chain": logger.info("Generating log-probability plots") - generate_acc_logprob_plots( + generate_accuracy_plots( exp_dir, output_dir, g_plot, key='log-probability', file_basename='log_probability', comparison_dir=comparison_dir, start_iter=start_iter, latex_report=latex_report, output_name=output_name) else: logger.info("Generating " + objective_type + " objective plots") - generate_acc_logprob_plots( + generate_accuracy_plots( exp_dir, output_dir, g_plot, key='objective', file_basename='objective', comparison_dir=comparison_dir, start_iter=start_iter, @@ -660,18 +656,9 @@ def main(): else: output_nodes.append(('output', 'linear')) - if args.comparison_dir is not None: - generate_plots(args.exp_dir[0], args.output_dir, output_nodes, - comparison_dir=args.comparison_dir, - start_iter=args.start_iter) 
- else: - if len(args.exp_dir) == 1: - generate_plots(args.exp_dir[0], args.output_dir, output_nodes, - start_iter=args.start_iter) - if len(args.exp_dir) > 1: - generate_plots(args.exp_dir[0], args.output_dir, output_nodes, - comparison_dir=args.exp_dir[1:], - start_iter=args.start_iter) + generate_plots(args.exp_dir, args.output_dir, output_nodes, + comparison_dir=args.comparison_dir, + start_iter=args.start_iter) if __name__ == "__main__": diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh index 77198a00576..bdee5a54e4d 100755 --- a/egs/wsj/s5/steps/nnet3/train_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -192,8 +192,18 @@ if [ $stage -le -1 ]; then echo "$0: setting learning rate to $learning_rate = --num-jobs-nnet * --effective-lrate." fi + + # set the learning rate to $learning_rate, and + # set the output-layer's learning rate to + # $learning_rate times $last_layer_factor. + edits_str="set-learning-rate learning-rate=$learning_rate" + if [ "$last_layer_factor" != "1.0" ]; then + last_layer_lrate=$(perl -e "print ($learning_rate*$last_layer_factor);") || exit 1 + edits_str="$edits_str; set-learning-rate name=output.affine learning-rate=$last_layer_lrate" + fi + $cmd $dir/log/convert.log \ - nnet3-am-copy --learning-rate=$learning_rate "$src_model" $dir/0.mdl || exit 1; + nnet3-am-copy --edits="$edits_str" "$src_model" $dir/0.mdl || exit 1; ln -sf 0.mdl $dir/epoch0.mdl fi diff --git a/egs/wsj/s5/steps/shift_feats.sh b/egs/wsj/s5/steps/shift_feats.sh index 9ad85368c3f..22b17f2cb09 100755 --- a/egs/wsj/s5/steps/shift_feats.sh +++ b/egs/wsj/s5/steps/shift_feats.sh @@ -5,9 +5,10 @@ # This script shifts the feats in the input data directory and creates a # new directory _fs with shifted feats. -# If the shift is negative, the initial frames get truncated. -# If the shift is positive, the first frame is repeated. -# Usually applicable for sequence training +# If the shift is negative, the initial frames get truncated and the +# last frame repeated; if positive, vice versa. +# Used to prepare data for sequence training of models with +# frame_subsampling_factor != 1 (e.g. chain models). # To be run from .. (one directory up from here) # see ../run.sh for example @@ -82,4 +83,3 @@ if [ $nf -ne $nu ]; then fi echo "Succeeded shifting features for $name into $data" - diff --git a/egs/wsj/s5/utils/data/limit_feature_dim.sh b/egs/wsj/s5/utils/data/limit_feature_dim.sh index 4e64e68d7c7..2d969ee569b 100755 --- a/egs/wsj/s5/utils/data/limit_feature_dim.sh +++ b/egs/wsj/s5/utils/data/limit_feature_dim.sh @@ -1,5 +1,5 @@ #!/bin/bash -77;20003;0c + # Copyright 2016 Alibaba Robotics Corp. (author: Xingyu Na) # Apache 2.0 From 496eec57583e42891343d396b4fa190d39471b55 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 26 Jan 2017 00:18:52 -0500 Subject: [PATCH 124/213] [egs][scripts] Adding more example scripts for Tedlium and Swbd; add nnet3_disc_dir_info.pl. 
--- .../s5c/local/chain/compare_wer_general.sh | 5 + .../s5c/local/nnet3/compare_wer_general.sh | 67 +++- .../local/nnet3/run_blstm_discriminative.sh | 4 +- egs/swbd/s5c/local/nnet3/run_tdnn_disc.sh | 1 + .../local/nnet3/run_tdnn_discriminative.sh | 175 --------- .../s5c/local/nnet3/tuning/run_tdnn_d_disc.sh | 55 ++- .../s5_r2/local/chain/compare_wer_general.sh | 57 ++- .../s5_r2/local/chain/tuning/run_tdnn_1d.sh | 256 +++++++++++++ .../chain/tuning/run_tdnn_lstm_1e_disc.sh | 264 ++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1i.sh | 337 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1j.sh | 334 +++++++++++++++++ egs/tedlium/s5_r2/local/nnet3/compare_wer.sh | 75 +++- .../s5_r2/local/nnet3/run_tdnn_lstm.sh | 1 + .../s5_r2/local/nnet3/run_tdnn_lstm_disc.sh | 1 + .../s5_r2/local/nnet3/tuning/run_tdnn_1a.sh | 120 +++++++ .../s5_r2/local/nnet3/tuning/run_tdnn_1b.sh | 169 +++++++++ .../local/nnet3/tuning/run_tdnn_lstm_1a.sh | 228 ++++++++++++ .../local/nnet3/tuning/run_tdnn_lstm_1b.sh | 240 +++++++++++++ .../local/nnet3/tuning/run_tdnn_lstm_1c.sh | 234 ++++++++++++ egs/wsj/s5/steps/info/chain_dir_info.pl | 4 +- egs/wsj/s5/steps/info/nnet2_dir_info.pl | 4 +- egs/wsj/s5/steps/info/nnet3_dir_info.pl | 4 +- egs/wsj/s5/steps/info/nnet3_disc_dir_info.pl | 172 +++++++++ 23 files changed, 2592 insertions(+), 215 deletions(-) create mode 120000 egs/swbd/s5c/local/nnet3/run_tdnn_disc.sh delete mode 100755 egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh create mode 120000 egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm.sh create mode 120000 egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_disc.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1a.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh create mode 100755 egs/wsj/s5/steps/info/nnet3_disc_dir_info.pl diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh index c8aae0b3b94..1b1f0d16047 100755 --- a/egs/swbd/s5c/local/chain/compare_wer_general.sh +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -1,5 +1,10 @@ #!/bin/bash +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer_general.sh tdnn_7h_sp tdnn_7i_sp + +echo "$0 $*"; # print command line. + echo -n "System " for x in $*; do printf "% 10s" $x; done echo diff --git a/egs/swbd/s5c/local/nnet3/compare_wer_general.sh b/egs/swbd/s5c/local/nnet3/compare_wer_general.sh index 11742173120..37eaeeac85b 100755 --- a/egs/swbd/s5c/local/nnet3/compare_wer_general.sh +++ b/egs/swbd/s5c/local/nnet3/compare_wer_general.sh @@ -1,48 +1,99 @@ #!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/nnet3/compare_wer_general.sh tdnn_c_sp tdnn_d_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/nnet3/compare_wer_general.sh tdnn_d_sp tdnn_d_sp_smbr:1 tdnn_d_sp_smbr:2 ... + +echo "# $0 $*"; # print command line. 
+ + echo -n "# System " -for x in $*; do printf "% 10s" $x; done +for x in $*; do printf " % 9s" $x; done echo + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free name, like: +# set_names tdnn_a_sp +# it will set dir=exp/nnet3/tdnn_a_sp and epoch_suffix="" +# If called with something like: +# set_names tdnn_d_sp_smbr:3 +# it will set dir=exp/nnet3/tdnn_d_sp_smbr and epoch_suffix="epoch3" +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + name=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + dirname=exp/nnet3/$name + if [ -z $epoch ]; then + epoch_suffix="" + else + used_epochs=true + epoch_suffix=_epoch${epoch} + fi +} + + echo -n "# WER on train_dev(tg) " for x in $*; do - wer=$(grep WER exp/nnet3/${x}_sp/decode_train_dev_hires_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + set_names $x + # note: the '*' in the directory name is because there + # is _hires_ in there for the cross-entropy systems, and + # nothing for the sequence trained systems. + wer=$(grep WER $dirname/decode_train_dev*sw1_tg$epoch_suffix/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo echo -n "# WER on train_dev(fg) " for x in $*; do - wer=$(grep WER exp/nnet3/${x}_sp/decode_train_dev_hires_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + set_names $x + wer=$(grep WER $dirname/decode_train_dev*sw1_fsh_fg$epoch_suffix/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo echo -n "# WER on eval2000(tg) " for x in $*; do - wer=$(grep Sum exp/nnet3/${x}_sp/decode_eval2000_hires_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + set_names $x + wer=$(grep Sum $dirname/decode_eval2000*sw1_tg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo echo -n "# WER on eval2000(fg) " for x in $*; do - wer=$(grep Sum exp/nnet3/${x}_sp/decode_eval2000_hires_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + set_names $x + wer=$(grep Sum $dirname/decode_eval2000*sw1_fsh_fg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo +if $used_epochs; then + # we don't print the probs in this case. + exit 0 +fi + echo -n "# Final train prob " for x in $*; do - prob=$(grep log-likelihood exp/nnet3/${x}_sp/log/compute_prob_train.combined.log | awk '{print $8}') + set_names $x + prob=$(grep log-likelihood $dirname/log/compute_prob_train.combined.log | awk '{print $8}') printf "% 10s" $prob done echo echo -n "# Final valid prob " for x in $*; do - prob=$(grep log-likelihood exp/nnet3/${x}_sp/log/compute_prob_valid.combined.log | awk '{print $8}') + set_names $x + prob=$(grep log-likelihood $dirname/log/compute_prob_valid.combined.log | awk '{print $8}') printf "% 10s" $prob done echo - diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh index c6dfb0107cd..ba751ad8732 100755 --- a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -2,7 +2,9 @@ set -o pipefail set -e -# this is run_discriminative.sh + +# Caution: this script is out of date, it does not use the +# refactored discriminative training script with get_degs.sh. 
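# Going back to the set_names helper added to nnet3/compare_wer_general.sh above:
# the colon parsing is just standard 'cut' behaviour, e.g. (the directory names are
# only examples):
#   echo tdnn_d_sp_smbr:3 | cut -d: -f1     # -> tdnn_d_sp_smbr
#   echo tdnn_d_sp_smbr:3 | cut -s -d: -f2  # -> 3
#   echo tdnn_d_sp        | cut -s -d: -f2  # -> (empty; -s suppresses lines with no
#                                           #     delimiter, so $epoch comes out empty)
# which is why a plain name gives epoch_suffix="" while name:3 gives epoch_suffix=_epoch3.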
# This script does discriminative training on top of CE BLSTM system. # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn_disc.sh b/egs/swbd/s5c/local/nnet3/run_tdnn_disc.sh new file mode 120000 index 00000000000..e4d47deb7a4 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_tdnn_disc.sh @@ -0,0 +1 @@ +tuning/run_tdnn_d_disc.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh deleted file mode 100755 index 7af311e7ff4..00000000000 --- a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh +++ /dev/null @@ -1,175 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -e -# this is run_discriminative.sh - -# This script does discriminative training on top of CE nnet3 system. -# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, -# since the lattice generation runs in about real-time, so takes of the order of -# 1000 hours of CPU time. -# -. cmd.sh - - -stage=0 -train_stage=-10 # can be used to start training in the middle. -get_egs_stage=-10 -use_gpu=true # for training -cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, - # alignments and degs). - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -srcdir=exp/nnet3/nnet_ms_a -train_data_dir=data/train_nodup_sp_hires -online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp -degs_dir= # If provided, will skip the degs directory creation -lats_dir= # If provided, will skip denlats creation - -## Objective options -criterion=smbr -one_silence_class=true - -dir=${srcdir}_${criterion} - -## Egs options -frames_per_eg=150 -frames_overlap_per_eg=30 - -## Nnet training options -effective_learning_rate=0.0000125 -max_param_change=1 -num_jobs_nnet=4 -num_epochs=4 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options -minibatch_size=64 - -## Decode options -decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. - -if $use_gpu; then - if ! cuda-compiled; then - cat <3606 combine=-0.10->-0.10 xent:train/valid[167,252,final]=(-1.47,-1.40,-1.40/-1.61,-1.57,-1.56) logprob:train/valid[167,252,final]=(-0.096,-0.087,-0.087/-0.119,-0.115,-0.115) +# exp/chain_cleaned/tdnn1d_sp_bi: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3606 combine=-0.10->-0.10 xent:train/valid[167,252,final]=(-1.46,-1.39,-1.39/-1.61,-1.56,-1.55) logprob:train/valid[167,252,final]=(-0.096,-0.088,-0.088/-0.120,-0.115,-0.115) + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1b_sp_bi exp/chain_cleaned/tdnn1d_sp_bi +# System tdnn1b_sp_bi tdnn1d_sp_bi +# WER on dev(orig) 9.4 9.5 +# WER on dev(rescored) 8.8 8.6 +# WER on test(orig) 9.6 9.4 +# WER on test(rescored) 9.0 8.9 +# Final train prob -0.0870 -0.0878 +# Final valid prob -0.1147 -0.1152 +# Final train prob (xent) -1.4014 -1.3921 +# Final valid prob (xent) -1.5634 -1.5543 + +# run_tdnn_1b.sh is like run_tdnn_1a.sh but upgrading to xconfig-based +# config generation. + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. 
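# A rough key to the steps/info/chain_dir_info.pl summary lines quoted at the top of
# this script (an informal reading of the fields, not official documentation):
#   num-iters=253            number of training iterations
#   nj=2..12                 parallel jobs, ramped from 2 up to 12 over training
#                            (cf. --trainer.optimization.num-jobs-initial/final below)
#   num-params=7.0M          total parameter count
#   dim=40+100->3606         40-dim MFCC + 100-dim i-vector input, 3606 output pdfs
#   combine=-0.10->-0.10     chain objective before -> after the final model combination
#   xent:... / logprob:...   cross-entropy and chain log-prob diagnostics on train/valid
#                            data at two intermediate iterations and the final model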
+ +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 self-repair-scale=1.0e-04 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh new file mode 100755 index 00000000000..0d64c75aea8 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh @@ -0,0 +1,264 @@ +#!/bin/bash + +# This script does discriminative training on top of the 1e chain system. To +# simplify things, this assumes you are using the "cleaned" data (since this is +# generally better), i.e. it won't work if you used options to run_tdnn_lstm_1e.sh +# to use the non-cleaned data. 
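# (A concrete instance of the xent learning-rate-factor comment in run_tdnn_1d.sh's
# xconfig above, which recurs in the TDNN-LSTM chain scripts below: with the default
#   xent_regularize=0.1
#   learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)   # -> 5.0
# i.e. the output-xent layer trains 5x faster than the rest of the network, cancelling
# the 0.1 scaling of the xent objective so that its final layer learns at a rate
# independent of the regularization constant.)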
+# +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the alignment and the lattice generation/egs-dumping takes quite a bit +# of CPU time. + + +# Below is with 0.00002 and last_layer_factor=0.5 +# this is the setting we're leaving in the script, but the discriminative training +# is not really helping. Maybe we should try the frame-shifted version. +# steps/info/nnet3_disc_dir_info.pl exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbroutslow2 +# exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbroutslow2:num-jobs=4;effective-lrate=2e-05;last-layer-factor=0.50;iters-per-epoch=138;epoch[0,1,2]:train-objf=[0.94,0.96,0.97],valid-objf=[0.95,0.96,0.96],train-counts=[0.24,0.12,0.10],valid-counts=[0.28,0.20,0.17] +# b01:s5_r2: steps/info/nnet3_disc_dir_info.pl exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbroutslow + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbroutslow2:{1,2} +# System tdnn_lstm1e_sp_bi tdnn_lstm1e_sp_bi_smbroutslow2:1 tdnn_lstm1e_sp_bi_smbroutslow2:2 +# WER on dev(orig) 9.0 8.9 8.9 +# [looped:] 9.0 8.9 8.9 +# WER on dev(rescored) 8.4 8.3 8.4 +# [looped:] 8.4 8.3 8.4 +# WER on test(orig) 8.8 8.7 8.8 +# [looped:] 8.8 8.8 8.8 +# WER on test(rescored) 8.4 8.3 8.4 +# [looped:] 8.3 8.4 8.5 + + + +# Below is with 0.00002 and last_layer_factor=1.0. +# b01:s5_r2: steps/info/nnet3_disc_dir_info.pl exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr:num-jobs=4;lrate=2e-05;iters-per-epoch=138;epoch[0,1,2]:train-objf=[0.94,0.96,0.97],valid-objf=[0.95,0.96,0.96],train-counts=[0.24,0.12,0.09],valid-counts=[0.28,0.19,0.16] +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr:{1,2} +# System tdnn_lstm1e_sp_bi tdnn_lstm1e_sp_bi_smbr:1 tdnn_lstm1e_sp_bi_smbr:2 +# WER on dev(orig) 9.0 8.8 8.9 +# [looped:] 9.0 8.9 8.9 +# WER on dev(rescored) 8.4 8.3 8.4 +# [looped:] 8.4 8.3 8.4 +# WER on test(orig) 8.8 8.8 8.9 +# [looped:] 8.8 8.8 8.9 +# WER on test(rescored) 8.4 8.4 8.5 +# [looped:] 8.3 8.4 8.5 + + +set -e +set -uo pipefail + +stage=1 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=0 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like + # alignments and degs). +degs_dir= # set this to use preexisting degs. +nj=400 # have a high number of jobs because this could take a while, and we might + # have some stragglers. +# you can set disc_affix if you run different configurations, e.g. --disc-affix "_b" +disc_affix= + + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +srcdir=exp/chain_cleaned/tdnn_lstm1e_sp_bi +graph_dir=$srcdir/graph +train_data_dir=data/train_cleaned_sp_hires_comb +online_ivector_dir=exp/nnet3_cleaned/ivectors_train_cleaned_sp_hires_comb + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion}${disc_affix} + +## Egs options. Give quite a few choices of chunk length, +## so it can split utterances without much gap or overlap. +frames_per_eg=300,280,150,120,100 +frames_overlap_per_eg=0 +frames_per_chunk_egs=200 # frames-per-chunk for decoding in alignment and + # denlat decoding. +frames_per_chunk_decoding=140 # frames-per-chunk for decoding when we test + # the models. +## these context options should match the training condition. 
(chunk_left_context, +## chunk_right_context) +## We set --extra-left-context-initial 0 and --extra-right-context-final 0 +## directly in the script below, but this should also match the training condition. +extra_left_context=40 +extra_right_context=0 + + + +## Nnet training options +effective_learning_rate=0.00002 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=2 +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options, + # in chain models. +last_layer_factor=0.5 # have the output layer train slower than the others.. this can + # be helpful. +minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); + # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). + + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! cuda-compiled; then + cat </dev/null || true + + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x + # We don't test the iter "epoch${x}_adj", although it's computed, + # because prior-adjustment doesn't make sense for chain models + # and it degrades the results. + ( + steps/nnet3/decode_looped.sh \ + --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3_cleaned/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/${decode_set}_hires $dir/decode_looped_${decode_set}_${iter} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${decode_set}_hires \ + ${dir}/decode_looped_${decode_set}_${iter} ${dir}/decode_looped_${decode_set}_${iter}_rescore || exit 1 + ) || touch $dir/.error & + done + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +wait; + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. + # actually, keep the alignments in case we need them later.. they're slow to + # create, and quite big. + # rm ${srcdir}_ali/ali.*.gz || true + + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh new file mode 100755 index 00000000000..62497ca59ff --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -0,0 +1,337 @@ +#!/bin/bash + +# 1i is as 1e, but adding boundary-offset. No clear effect. +# +# the 3 columns below are: baseline; boundary-offset with that component +# learning with 10x the normal learning rate; boundary-offset with +# regular learning rate. There seems no clear benefit from this +# idea. 
Reverting the code changes that supported it; +# see ~dpovey/patches/lstm_boundary.patch + + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1i_sp_bi exp/chain_cleaned/tdnn_lstm1i_sp_bi.orig_learning_rate +# System tdnn_lstm1e_sp_bi tdnn_lstm1i_sp_bi tdnn_lstm1i_sp_bi.orig_learning_rate +# WER on dev(orig) 9.0 9.1 8.9 +# [looped:] 9.0 9.0 9.0 +# WER on dev(rescored) 8.4 8.3 8.3 +# [looped:] 8.4 8.2 8.2 +# WER on test(orig) 8.8 8.9 8.9 +# [looped:] 8.8 8.9 8.9 +# WER on test(rescored) 8.4 8.4 8.4 +# [looped:] 8.3 8.4 8.4 +# Final train prob -0.0648 -0.0625 -0.0644 +# Final valid prob -0.0827 -0.0833 -0.0855 +# Final train prob (xent) -0.8372 -0.8129 -0.8286 +# Final valid prob (xent) -0.9497 -0.9558 -0.9641 + + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1i #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 boundary-offset=true + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 boundary-offset=true + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 boundary-offset=true + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false 
dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh new file mode 100755 index 00000000000..c9a57f0ab4d --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -0,0 +1,334 @@ +#!/bin/bash + +# 1j is as 1e, but adding self-repair-scale=1.0e-04 on 1st tdnn layer [default is 1e-5]. +# It's definitely more effective in preventing under or over-saturated ReLUs, but +# it's not clear that there is any other benefit. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,j}_sp_bi +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1j_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1j_sp_bi +# WER on dev(orig) 9.0 9.1 +# [looped:] 9.0 9.1 +# WER on dev(rescored) 8.4 8.5 +# [looped:] 8.4 8.5 +# WER on test(orig) 8.8 9.0 +# [looped:] 8.8 9.1 +# WER on test(rescored) 8.4 8.6 +# [looped:] 8.3 8.5 +# Final train prob -0.0648 -0.0646 +# Final valid prob -0.0827 -0.0835 +# Final train prob (xent) -0.8372 -0.8296 +# Final valid prob (xent) -0.9497 -0.9597 + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. 
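# ('Self-repair', for reference, is the mechanism that adds a small corrective term to
# the derivative of a unit whose activation statistics show it is stuck -- e.g. a ReLU
# that is almost never, or almost always, active -- nudging it back toward a useful
# operating range; raising self-repair-scale just makes that correction stronger.  In
# this script the change shows up only on the first TDNN layer of the xconfig below:
#   relu-renorm-layer name=tdnn1 dim=512 self-repair-scale=1.0e-04
# with the other layers left at the default of 1e-05.)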
+ +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1j #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 self-repair-scale=1.0e-04 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
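  # Note (applies to both decoding stages): --acwt 1.0 --post-decode-acwt 10.0 is the
  # usual convention for chain models -- they are decoded with an acoustic scale of 1.0,
  # and the post-decode scaling by 10 just puts the lattice scores back in the range
  # that the standard scoring scripts (LM weights around 10) expect.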
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh index cff39def83b..3e14a4efc55 100755 --- a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh +++ b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh @@ -1,7 +1,13 @@ #!/bin/bash +# this script is used for comparing decoding results between systems. +# e.g. local/nnet3/compare_wer_general.sh exp/nnet3_cleaned/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_c_sp exp/nnet3_cleaned/tdnn_c_sp_smbr:{1,2,3} -echo $0 $* + +echo "# $0 $*" include_looped=false if [ "$1" == "--looped" ]; then @@ -9,24 +15,58 @@ if [ "$1" == "--looped" ]; then shift fi -echo -n "System " + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " for x in $*; do printf "% 10s" " $(basename $x)"; done echo -dirnames=(dev dev_rescore test test_rescore) -strings=("WER on dev(orig) " "WER on dev(rescored) " "WER on test(orig) " "WER on test(rescored)") +strings=("# WER on dev(orig) " "# WER on dev(rescored) " "# WER on test(orig) " "# WER on test(rescored)") for n in 0 1 2 3; do echo -n "${strings[$n]}" for x in $*; do - wer=$(grep Sum $x/decode_${dirnames[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo if $include_looped; then - echo -n " [looped:] " + echo -n "# [looped:] " for x in $*; do - wer=$(grep Sum $x/decode_looped_${dirnames[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum 
$dirname/decode_looped_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo @@ -34,30 +74,35 @@ for n in 0 1 2 3; do done -echo -n "Final train prob " +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_train.combined.log | grep log-like | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "Final valid prob " +echo -n "# Final valid prob " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_valid.combined.log | grep log-like | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "Final train acc " +echo -n "# Final train acc " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_train.combined.log | grep accuracy | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "Final valid acc " +echo -n "# Final valid acc " for x in $*; do - prob=$(grep Overall $x/log/compute_prob_valid.combined.log | grep accuracy | awk '{printf("%.4f", $8)}') + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_disc.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_disc.sh new file mode 120000 index 00000000000..50d28fb91f3 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_disc.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a_disc.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..80ff91b8606 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# This is the standard "tdnn" system, built in nnet3; this script +# is the version that's meant to run with data-cleanup, that doesn't +# support parallel alignments. + + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1a_sp exp/nnet3_cleaned/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# WER on dev(orig) 11.9 11.7 +# WER on dev(rescored) 11.2 10.9 +# WER on test(orig) 11.6 11.7 +# WER on test(rescored) 11.0 11.0 +# Final train prob -0.9255 -0.9416 +# Final valid prob -1.1842 -1.1496 +# Final train acc 0.7245 0.7241 +# Final valid acc 0.6771 0.6788 + + +# by default, with cleanup: +# local/nnet3/run_tdnn.sh + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
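# One detail worth noting in the compare_wer.sh change above: the pattern
#   $x/log/compute_prob_train.{final,combined}.log
# is ordinary bash brace expansion, so grep is handed both candidate filenames, e.g.
#   echo exp/foo/log/compute_prob_train.{final,combined}.log
#   # -> exp/foo/log/compute_prob_train.final.log exp/foo/log/compute_prob_train.combined.log
# and the 2>/dev/null hides the "No such file" error for whichever of the two a given
# training run did not produce.  (exp/foo is just a placeholder here.)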
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1a #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0" +remove_egs=true +relu_dim=850 +num_epochs=3 + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat </dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..379c8040a27 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh @@ -0,0 +1,169 @@ +#!/bin/bash + +# This is the standard "tdnn" system, built in nnet3; this script +# is the version that's meant to run with data-cleanup, that doesn't +# support parallel alignments. + + +# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1b_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1a_sp exp/nnet3_cleaned/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# WER on dev(orig) 11.9 11.7 +# WER on dev(rescored) 11.2 10.9 +# WER on test(orig) 11.6 11.7 +# WER on test(rescored) 11.0 11.0 +# Final train prob -0.9255 -0.9416 +# Final valid prob -1.1842 -1.1496 +# Final train acc 0.7245 0.7241 +# Final valid acc 0.6771 0.6788 + + +# by default, with cleanup: +# local/nnet3/run_tdnn.sh + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1b #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +relu_dim=850 +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=850 + relu-renorm-layer name=tdnn2 dim=850 input=Append(-1,2) + relu-renorm-layer name=tdnn3 dim=850 input=Append(-3,3) + relu-renorm-layer name=tdnn4 dim=850 input=Append(-7,2) + relu-renorm-layer name=tdnn5 dim=850 input=Append(-3,3) + relu-renorm-layer name=tdnn6 dim=850 + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..f1502dd2761 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,228 @@ +#!/bin/bash + +# this is a TDNN+LSTM system; the configuration is similar to +# local/chain/tuning/run_tdnn_lstm_1e.sh, but a non-chain nnet3 system, and +# with 1.5 times larger hidden dimensions.
+ + +# local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1b_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp +# WER on dev(orig) 11.0 11.0 +# [looped:] 11.0 11.1 +# WER on dev(rescored) 10.3 10.3 +# [looped:] 10.3 10.5 +# WER on test(orig) 10.8 10.6 +# [looped:] 10.7 10.7 +# WER on test(rescored) 10.1 9.9 +# [looped:] 10.0 10.0 +# Final train prob -0.6881 -0.6897 +# Final valid prob -0.7796 -0.7989 +# Final train acc 0.7954 0.7946 +# Final valid acc 0.7611 0.7582 + + +# by default, with cleanup: +# local/nnet3/run_tdnn_lstm.sh + +# without cleanup: +# local/nnet3/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix=1a +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +# decode chunk-size options (for non-looped decoding) +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + relu-renorm-layer name=tdnn2 dim=768 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=10000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=15 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +if [ $stage -le 15 ]; then + # 'looped' decoding. + # note: you should NOT do this decoding step for setups that have bidirectional + # recurrence, like BLSTMs-- it doesn't make sense and will give bad results. + # we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $graph_dir data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..8b8af6eff78 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +# 1b is as 1a, but removing the decay-time option as a baseline. + +# the decay-time option does seem to be having the expected interaction with +# 'looped' decoding, i.e. with the decay-time option we don't get a degradation +# from looped decoding (if anything, with decay time, looped decoding is a +# little better than baseline decoding). + +# local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1b_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp +# WER on dev(orig) 11.0 11.0 +# [looped:] 11.0 11.1 +# WER on dev(rescored) 10.3 10.3 +# [looped:] 10.3 10.5 +# WER on test(orig) 10.8 10.6 +# [looped:] 10.7 10.7 +# WER on test(rescored) 10.1 9.9 +# [looped:] 10.0 10.0 +# Final train prob -0.6881 -0.6897 +# Final valid prob -0.7796 -0.7989 +# Final train acc 0.7954 0.7946 +# Final valid acc 0.7611 0.7582 + + + +# this is a TDNN+LSTM system; the configuration is similar to +# local/chain/tuning/run_tdnn_lstm_1e.sh, but a non-chain nnet3 system, and +# with 1.5 times larger hidden dimensions. + +# by default, with cleanup: +# local/nnet3/run_tdnn_lstm.sh + +# without cleanup: +# local/nnet3/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix=1b +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +# decode chunk-size options (for non-looped decoding) +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + relu-renorm-layer name=tdnn2 dim=768 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 delay=-3 + relu-renorm-layer name=tdnn3 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 delay=-3 + relu-renorm-layer name=tdnn5 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=10000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=15 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # caution: we don't set the --frames-per-chunk here, we just use the + # default value of 50, which happens to be suitable because it's + # close to the primary chunk_width of 40. 
+ steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +if [ $stage -le 15 ]; then + # 'looped' decoding. + # note: you should NOT do this decoding step for setups that have bidirectional + # recurrence, like BLSTMs-- it doesn't make sense and will give bd results. + # we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $graph_dir data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh new file mode 100755 index 00000000000..1d3b12f2697 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# run_tdnn_lstm_1c.sh is as run_tdnn_lstm_1a.sh, but about 1.5 times larger +# chunk lengths than 1a. +# There doesn't seem to be any advantage in the longer chunk lengths. + +# this is a TDNN+LSTM system; the configuration is similar to +# local/chain/tuning/run_tdnn_lstm_1e.sh, but a non-chain nnet3 system, and +# with 1.5 times larger hidden dimensions. + +# local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1b_sp exp/nnet3_cleaned/tdnn_lstm1c_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp tdnn_lstm1c_sp +# WER on dev(orig) 11.0 11.0 11.0 +# [looped:] 11.0 11.1 10.9 +# WER on dev(rescored) 10.3 10.3 10.4 +# [looped:] 10.3 10.5 10.3 +# WER on test(orig) 10.8 10.6 10.8 +# [looped:] 10.7 10.7 10.7 +# WER on test(rescored) 10.1 9.9 10.1 +# [looped:] 10.0 10.0 10.1 +# Final train prob -0.6881 -0.6897 -0.5998 +# Final valid prob -0.7796 -0.7989 -0.8542 +# Final train acc 0.7954 0.7946 0.7988 +# Final valid acc 0.7611 0.7582 0.7521 + + + +# by default, with cleanup: +# local/nnet3/run_tdnn_lstm.sh + +# without cleanup: +# local/nnet3/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix=1c +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=60,50,40,30 +chunk_left_context=40 +chunk_right_context=0 +# decode chunk-size options (for non-looped decoding) +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + relu-renorm-layer name=tdnn2 dim=768 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=10000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=15 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +if [ $stage -le 15 ]; then + # 'looped' decoding. + # note: you should NOT do this decoding step for setups that have bidirectional + # recurrence, like BLSTMs-- it doesn't make sense and will give bad results. + # we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $graph_dir data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0; diff --git a/egs/wsj/s5/steps/info/chain_dir_info.pl b/egs/wsj/s5/steps/info/chain_dir_info.pl index b43e1752ee8..1d659b89c89 100755 --- a/egs/wsj/s5/steps/info/chain_dir_info.pl +++ b/egs/wsj/s5/steps/info/chain_dir_info.pl @@ -17,8 +17,8 @@ "This script extracts some important information from the logs\n" . "and displays it on a single (rather long) line.\n" . "The --debug option is just to debug the script itself.\n" . - "This program exits with status 0 if it seems like the argument\n" . - "really was a GMM dir, and 1 otherwise.\n"; + "This program exits with status 0 if it seems like the arguments\n" . + "really were of the expected directory type, and 1 otherwise.\n"; exit(1); } diff --git a/egs/wsj/s5/steps/info/nnet2_dir_info.pl b/egs/wsj/s5/steps/info/nnet2_dir_info.pl index 6ef10a2e03d..e572245e0ca 100755 --- a/egs/wsj/s5/steps/info/nnet2_dir_info.pl +++ b/egs/wsj/s5/steps/info/nnet2_dir_info.pl @@ -17,8 +17,8 @@ "This script extracts some important information from the logs\n" . "and displays it on a single (rather long) line.\n" . "The --debug option is just to debug the script itself.\n" . - "This program exits with status 0 if it seems like the argument\n" . - "really was a GMM dir, and 1 otherwise.\n"; + "This program exits with status 0 if it seems like the arguments\n" . + "really were of the expected directory type, and 1 otherwise.\n"; exit(1); } diff --git a/egs/wsj/s5/steps/info/nnet3_dir_info.pl b/egs/wsj/s5/steps/info/nnet3_dir_info.pl index 89b4c398d46..46ddd9f822c 100755 --- a/egs/wsj/s5/steps/info/nnet3_dir_info.pl +++ b/egs/wsj/s5/steps/info/nnet3_dir_info.pl @@ -17,8 +17,8 @@ "This script extracts some important information from the logs\n" . "and displays it on a single (rather long) line.\n" . "The --debug option is just to debug the script itself.\n" . - "This program exits with status 0 if it seems like the argument\n" . - "really was a GMM dir, and 1 otherwise.\n"; + "This program exits with status 0 if it seems like the arguments\n" . + "really were of the expected directory type, and 1 otherwise.\n"; exit(1); } diff --git a/egs/wsj/s5/steps/info/nnet3_disc_dir_info.pl b/egs/wsj/s5/steps/info/nnet3_disc_dir_info.pl new file mode 100755 index 00000000000..10bdb70fc9f --- /dev/null +++ b/egs/wsj/s5/steps/info/nnet3_disc_dir_info.pl @@ -0,0 +1,172 @@ +#!/usr/bin/perl -w + +use Fcntl; + +# we may at some point support options. + +$debug = 0; # we set it to 1 for debugging the script itself. + +if ($ARGV[0] eq "--debug") { + $debug = 1; + shift @ARGV; +} + +if (@ARGV == 0) { + print STDERR "Usage: steps/info/nnet3_disc_dir_info.pl [--debug] [ ... ]\n" . + "e.g: steps/info/nnet3_dir_info.pl exp/nnet3/tdnn_sp_smbr\n" . + "This script extracts some important information from the logs\n" . + "and displays it on a few lines.\n" . + "The --debug option is just to debug the script itself.\n" . 
+ "This program exits with status 0 if it seems like the argument\n" . + "really was a GMM dir, and 1 otherwise.\n"; + exit(1); +} + +if (@ARGV > 1) { + # repeatedly invoke this program with each of the remaining args. + $exit_status = 0; + if ($debug) { $debug_opt = "--debug " } else { $debug_opt = ""; } + foreach $dir (@ARGV) { + if (system("$0 $debug_opt$dir") != 0) { + $exit_status = 1; + } + } + exit($exit_status); +} + +# from this point we can assume we're invoked with one argument. +$nnet_dir = shift @ARGV; + +# This function returns an array of iteration numbers, one +# for each epoch that has already completed (but including +# epoch zero)... e.g. +# it might return (0, 194, 388, 582). +# This is done by reading the soft links, e.g. epoch1.mdl ->194.mdl +sub get_iters_for_epochs { + my @ans = (); + for (my $n = 0; 1; $n++) { + if (-l "$nnet_dir/epoch$n.mdl") { + my $link_name = readlink("$nnet_dir/epoch$n.mdl"); + if ($link_name =~ m/^(\d+).mdl/) { + my $iter = $1; + push @ans, $iter; + } else { + die "unexpected link name $nnet_dir/epoch$n.mdl -> $link_name"; + } + } else { + if (@ans == 0) { + die "$nnet_dir does not seem to be a discriminative-training dir " . + "(expected a link $nnet_dir/epoch0.mdl)"; + } + return @ans; + } + } +} + + +sub get_num_jobs { + my $j = 1; + for (my $j = 1; 1; $j++) { + if (! -f "$nnet_dir/log/train.0.$j.log") { + if ($j == 1) { + die "$nnet_dir does not seem to be a discriminative-training dir " . + "(expected $nnet_dir/log/train.0.1.log to exist)"; + } else { + return $j - 1; + } + } + } +} + +# returns a string describing the effective learning rate and possibly +# any final-layer-factor. +sub get_effective_learning_rate_str { + # effective learning rate is the actual learning rate divided by the + # number of jobs. + my $convert_log = "$nnet_dir/log/convert.log"; + if (-f $convert_log) { + open(F, "<$convert_log"); + while () { + if (m/--edits/) { + if (m/set-learning-rate learning-rate=(\S+); set-learning-rate name=output.affine learning-rate=([^"']+)["']/) { + my $learning_rate = $1; + my $last_layer_factor = sprintf("%.2f", $2 / $1); + my $num_jobs = get_num_jobs(); + my $effective_learning_rate = sprintf("%.3g", $learning_rate / $num_jobs); + close(F); + return "effective-lrate=$effective_learning_rate;last-layer-factor=$last_layer_factor"; + } elsif (m/set-learning-rate learning-rate=([^"']+)["']/) { + my $learning_rate = $1; + my $num_jobs = get_num_jobs(); + my $effective_learning_rate = sprintf("%.3g", $learning_rate / $num_jobs); + close(F); + return "effective-lrate=$effective_learning_rate"; + } + } + } + } else { + die("Expected file $convert_log to exist"); + } + close(F); + return "lrate=??"; # could not parse it from the log. +} + + +# prints some info about the objective function... +sub get_objf_str { + my @iters_for_epochs = get_iters_for_epochs(); + if (@iters_for_epochs == 1) { + die("No epochs have finished in directory $nnet_dir") + } + # will produce output like: + # iters-per-epoch=123;epoch[0,1,2,3,4]:train-objf=[0.89,0.92,0.93,0.94],valid-objf=[...],train-counts=[...],valid-counts=[...]" + # the "counts" are the average num+den occupation counts in the lattices; it's a measure of how much confusability + # there still is in the lattices. + my $iters_per_epoch = $iters_for_epochs[1] - $iters_for_epochs[0]; + my $ans = "iters-per-epoch=$iters_per_epoch"; + $ans .= ";epoch[" . join(",", 0..$#iters_for_epochs) . 
"]:"; + my @train_objfs = (); + my @train_counts = (); + my @valid_objfs = (); + my @valid_counts = (); + foreach $iter (@iters_for_epochs) { + if ($iter > 0) { $iter -= 1; } # last iter will not exist. + my $train_log = "$nnet_dir/log/compute_objf_train.$iter.log"; + my $valid_log = "$nnet_dir/log/compute_objf_valid.$iter.log"; + if (!open (T, "<$train_log")){ print STDERR "$0: warning: Expected file $train_log to exist\n"; } + if (!open (V, "<$valid_log")){ print STDERR "$0: warning: Expected file $valid_log to exist\n"; } + my $train_count = "??"; + my $valid_count = "??"; + my $train_objf = "??"; + my $valid_objf = "??"; + while () { + if (m/num\+den count.+is (\S+) per frame/) { $train_count = sprintf("%.2f", $1); } + if (m/Overall.+ is (\S+) per frame/) { $train_objf = sprintf("%.2f", $1); } + } + close(T); + while () { + if (m/num\+den count.+is (\S+) per frame/) { $valid_count = sprintf("%.2f", $1); } + if (m/Overall.+ is (\S+) per frame/) { $valid_objf = sprintf("%.2f", $1); } + } + push @train_objfs, $train_objf; + push @train_counts, $train_count; + push @valid_objfs, $valid_objf; + push @valid_counts, $valid_count; + close(V); + } + $ans .= "train-objf=[" . join(",", @train_objfs) . + "],valid-objf=[" . join(",", @valid_objfs) . + "],train-counts=[" . join(",", @train_counts) . + "],valid-counts=[" . join(",", @valid_counts) . "]"; + return $ans; +} + + + + +$output_string = "$nnet_dir:num-jobs=".get_num_jobs().";" . + get_effective_learning_rate_str() . ";" . get_objf_str(); + +print "$output_string\n"; + +exit(0); From ac602edbf1955d9fe35b2aca61b2e756cd2dc03b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 22 Jan 2017 20:33:40 -0500 Subject: [PATCH 125/213] [src] nnet3: modifying nnet-combine.{h,cc} to support soft enforcement of sum-to-one constraint. --- src/nnet3/nnet-combine.cc | 29 +++++++++++++++-------------- src/nnet3/nnet-combine.h | 17 ++++++++++++++++- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc index d50b5adc072..657b1972620 100644 --- a/src/nnet3/nnet-combine.cc +++ b/src/nnet3/nnet-combine.cc @@ -155,10 +155,10 @@ void NnetCombiner::Combine() { KALDI_LOG << "Combining nnets, objective function changed from " << initial_objf << " to " << objf; } else { - Vector weights(WeightDim()); + Vector weights(WeightDim()); GetWeights(params, &weights); bool print_weights = true; - double penalty = GetSumToOnePenalty(weights, NULL, print_weights); + BaseFloat penalty = GetSumToOnePenalty(weights, NULL, print_weights); // note: initial_objf has no penalty term because it summed exactly // to one. 
KALDI_LOG << "Combining nnets, objective function changed from " @@ -380,12 +380,12 @@ void NnetCombiner::GetParamsDeriv(const VectorBase &weights, double NnetCombiner::GetSumToOnePenalty( - const VectorBase &weights, - VectorBase *weights_penalty_deriv, + const VectorBase &weights, + VectorBase *weights_penalty_deriv, bool print_weights) const { KALDI_ASSERT(config_.sum_to_one_penalty >= 0.0); - double penalty = config_.sum_to_one_penalty; + BaseFloat penalty = config_.sum_to_one_penalty; if (penalty == 0.0) { weights_penalty_deriv->SetZero(); return 0.0; @@ -393,13 +393,13 @@ double NnetCombiner::GetSumToOnePenalty( double ans = 0.0; int32 num_uc = NumUpdatableComponents(), num_models = nnet_params_.NumRows(); - Vector tot_weights(num_uc); + Vector tot_weights(num_uc); std::ostringstream tot_weight_info; for (int32 c = 0; c < num_uc; c++) { double this_total_weight = 0.0; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; - double this_weight = weights(index); + BaseFloat this_weight = weights(index); this_total_weight += this_weight; } tot_weights(c) = this_total_weight; @@ -409,7 +409,7 @@ double NnetCombiner::GetSumToOnePenalty( KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); // this_total_weight_deriv is the derivative of the penalty // term w.r.t. this component's total weight. - double this_total_weight_deriv = + BaseFloat this_total_weight_deriv = penalty * (1.0 - this_total_weight); for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; @@ -418,16 +418,15 @@ double NnetCombiner::GetSumToOnePenalty( } } if (print_weights) { - Vector tot_weights_float(tot_weights); KALDI_LOG << "Total weights per component: " << PrintVectorPerUpdatableComponent(nnet_, - tot_weights_float); + tot_weights); } return ans; } -void NnetCombiner::GetNnetParameters(const Vector &weights, +void NnetCombiner::GetNnetParameters(const Vector &weights, VectorBase *nnet_params) const { KALDI_ASSERT(nnet_params->Dim() == nnet_params_.NumCols()); nnet_params->SetZero(); @@ -504,9 +503,11 @@ double NnetCombiner::ComputeObjfAndDerivFromNnet( double NnetCombiner::ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv) { - Vector weights(WeightDim()), normalized_weights(WeightDim()), + VectorBase ¶ms, + VectorBase *params_deriv) { + Vector weights(WeightDim()), normalized_weights(WeightDim()), + nnet_params(NnetParameterDim(), kUndefined), + nnet_params_deriv(NnetParameterDim(), kUndefined), weights_sum_to_one_penalty_deriv(WeightDim()), normalized_weights_deriv(WeightDim()), weights_deriv(WeightDim()); Vector diff --git a/src/nnet3/nnet-combine.h b/src/nnet3/nnet-combine.h index 5b60d30b8ed..8cb5c80bf26 100644 --- a/src/nnet3/nnet-combine.h +++ b/src/nnet3/nnet-combine.h @@ -79,7 +79,7 @@ struct NnetCombineConfig { "on the squared difference between sum(weights) for one component," " and 1.0. This is like --enforce-sum-to-one, but done in a 'soft' " "way (e.g. maybe useful with dropout). 
We suggest small values " - "like 10e-3 (for regular nnets) or 1.0e-04 (for chain models)."); + "like 10e-2 (for regular nnets) or 1.0e-03 (for chain models)."); po->Register("separate-weights-per-component", &separate_weights_per_component, "If true, have a separate weight for each updatable component in " "the nnet."); @@ -205,6 +205,21 @@ class NnetCombiner { bool print_weights = false) const; + // if config_.sum_to_one_penalty is 0.0, returns 0.0 and sets + // weights_penalty_deriv to 0.0; else it computes, for each + // updatable component u the total weight w_u, returns the value + // -0.5 * config_.sum_to_one_penalty * sum_u (w_u - 1.0)^2; + // and sets 'weights_penalty_deriv' to the derivative w.r.t. + // the result. + // Note: config_.sum_to_one_penalty is exclusive with + // config_.enforce_sum_to_one, so there is really no distinction between + // normalized and unnormalized weights here (since normalization would be a + // no-op). + double GetSumToOnePenalty(const VectorBase &weights, + VectorBase *weights_penalty_deriv, + bool print_weights = false) const; + + // Computes the nnet-parameter vector from the normalized weights and // nnet_params_, as a vector. (See the functions Vectorize() and // UnVectorize() for how they relate to the nnet's components' parameters). From 7a69473d254d5d4063fa6445ffe7399c90abed63 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 22 Jan 2017 21:07:14 -0500 Subject: [PATCH 126/213] [src] nnet3: Changing from floating-point to double precision in nnet-combine code. --- src/nnet3/nnet-combine.cc | 29 ++++++++++++++--------------- src/nnet3/nnet-combine.h | 17 +---------------- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc index 657b1972620..d50b5adc072 100644 --- a/src/nnet3/nnet-combine.cc +++ b/src/nnet3/nnet-combine.cc @@ -155,10 +155,10 @@ void NnetCombiner::Combine() { KALDI_LOG << "Combining nnets, objective function changed from " << initial_objf << " to " << objf; } else { - Vector weights(WeightDim()); + Vector weights(WeightDim()); GetWeights(params, &weights); bool print_weights = true; - BaseFloat penalty = GetSumToOnePenalty(weights, NULL, print_weights); + double penalty = GetSumToOnePenalty(weights, NULL, print_weights); // note: initial_objf has no penalty term because it summed exactly // to one. 
KALDI_LOG << "Combining nnets, objective function changed from " @@ -380,12 +380,12 @@ void NnetCombiner::GetParamsDeriv(const VectorBase &weights, double NnetCombiner::GetSumToOnePenalty( - const VectorBase &weights, - VectorBase *weights_penalty_deriv, + const VectorBase &weights, + VectorBase *weights_penalty_deriv, bool print_weights) const { KALDI_ASSERT(config_.sum_to_one_penalty >= 0.0); - BaseFloat penalty = config_.sum_to_one_penalty; + double penalty = config_.sum_to_one_penalty; if (penalty == 0.0) { weights_penalty_deriv->SetZero(); return 0.0; @@ -393,13 +393,13 @@ double NnetCombiner::GetSumToOnePenalty( double ans = 0.0; int32 num_uc = NumUpdatableComponents(), num_models = nnet_params_.NumRows(); - Vector tot_weights(num_uc); + Vector tot_weights(num_uc); std::ostringstream tot_weight_info; for (int32 c = 0; c < num_uc; c++) { double this_total_weight = 0.0; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; - BaseFloat this_weight = weights(index); + double this_weight = weights(index); this_total_weight += this_weight; } tot_weights(c) = this_total_weight; @@ -409,7 +409,7 @@ double NnetCombiner::GetSumToOnePenalty( KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); // this_total_weight_deriv is the derivative of the penalty // term w.r.t. this component's total weight. - BaseFloat this_total_weight_deriv = + double this_total_weight_deriv = penalty * (1.0 - this_total_weight); for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; @@ -418,15 +418,16 @@ double NnetCombiner::GetSumToOnePenalty( } } if (print_weights) { + Vector tot_weights_float(tot_weights); KALDI_LOG << "Total weights per component: " << PrintVectorPerUpdatableComponent(nnet_, - tot_weights); + tot_weights_float); } return ans; } -void NnetCombiner::GetNnetParameters(const Vector &weights, +void NnetCombiner::GetNnetParameters(const Vector &weights, VectorBase *nnet_params) const { KALDI_ASSERT(nnet_params->Dim() == nnet_params_.NumCols()); nnet_params->SetZero(); @@ -503,11 +504,9 @@ double NnetCombiner::ComputeObjfAndDerivFromNnet( double NnetCombiner::ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv) { - Vector weights(WeightDim()), normalized_weights(WeightDim()), - nnet_params(NnetParameterDim(), kUndefined), - nnet_params_deriv(NnetParameterDim(), kUndefined), + VectorBase ¶ms, + VectorBase *params_deriv) { + Vector weights(WeightDim()), normalized_weights(WeightDim()), weights_sum_to_one_penalty_deriv(WeightDim()), normalized_weights_deriv(WeightDim()), weights_deriv(WeightDim()); Vector diff --git a/src/nnet3/nnet-combine.h b/src/nnet3/nnet-combine.h index 8cb5c80bf26..5b60d30b8ed 100644 --- a/src/nnet3/nnet-combine.h +++ b/src/nnet3/nnet-combine.h @@ -79,7 +79,7 @@ struct NnetCombineConfig { "on the squared difference between sum(weights) for one component," " and 1.0. This is like --enforce-sum-to-one, but done in a 'soft' " "way (e.g. maybe useful with dropout). 
We suggest small values " - "like 10e-2 (for regular nnets) or 1.0e-03 (for chain models)."); + "like 10e-3 (for regular nnets) or 1.0e-04 (for chain models)."); po->Register("separate-weights-per-component", &separate_weights_per_component, "If true, have a separate weight for each updatable component in " "the nnet."); @@ -205,21 +205,6 @@ class NnetCombiner { bool print_weights = false) const; - // if config_.sum_to_one_penalty is 0.0, returns 0.0 and sets - // weights_penalty_deriv to 0.0; else it computes, for each - // updatable component u the total weight w_u, returns the value - // -0.5 * config_.sum_to_one_penalty * sum_u (w_u - 1.0)^2; - // and sets 'weights_penalty_deriv' to the derivative w.r.t. - // the result. - // Note: config_.sum_to_one_penalty is exclusive with - // config_.enforce_sum_to_one, so there is really no distinction between - // normalized and unnormalized weights here (since normalization would be a - // no-op). - double GetSumToOnePenalty(const VectorBase &weights, - VectorBase *weights_penalty_deriv, - bool print_weights = false) const; - - // Computes the nnet-parameter vector from the normalized weights and // nnet_params_, as a vector. (See the functions Vectorize() and // UnVectorize() for how they relate to the nnet's components' parameters). From 856db5e5533f4a5c746639a6d89947bd0f9e540a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 25 Jan 2017 01:20:05 -0500 Subject: [PATCH 127/213] [src] Adding chain version of the combination changes from the last 2 commits --- src/nnet3/nnet-chain-combine.cc | 175 ++++++++++++++++++++++---------- src/nnet3/nnet-chain-combine.h | 54 ++++++---- 2 files changed, 157 insertions(+), 72 deletions(-) diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc index dd9b99fe26d..b80c585e7fa 100644 --- a/src/nnet3/nnet-chain-combine.cc +++ b/src/nnet3/nnet-chain-combine.cc @@ -38,7 +38,13 @@ NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config, nnet_params_(std::min(num_nnets, combine_config_.max_effective_inputs), NumParameters(first_nnet)), tot_input_weighting_(nnet_params_.NumRows()) { - SetDropoutProportion(0, &nnet_); + + if (combine_config_.sum_to_one_penalty != 0.0 && + combine_config_.enforce_sum_to_one) { + KALDI_WARN << "--sum-to-one-penalty=" << combine_config_.sum_to_one_penalty + << " is nonzero, so setting --enforce-sum-to-one=false."; + combine_config_.enforce_sum_to_one = false; + } SubVector first_params(nnet_params_, 0); VectorizeNnet(nnet_, &first_params); tot_input_weighting_(0) += 1.0; @@ -133,12 +139,12 @@ void NnetChainCombiner::Combine() { // itself, so this is BFGS. 
lbfgs_options.first_step_impr = combine_config_.initial_impr; - Vector params(dim), deriv(dim); - BaseFloat objf, initial_objf; + Vector params(dim), deriv(dim); + double objf, initial_objf; GetInitialParameters(¶ms); - OptimizeLbfgs lbfgs(params, lbfgs_options); + OptimizeLbfgs lbfgs(params, lbfgs_options); for (int32 i = 0; i < combine_config_.num_iters; i++) { params.CopyFromVec(lbfgs.GetProposedValue()); @@ -149,12 +155,25 @@ void NnetChainCombiner::Combine() { lbfgs.DoStep(objf, deriv); } - KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf; + if (!combine_config_.sum_to_one_penalty) { + KALDI_LOG << "Combining nnets, objective function changed from " + << initial_objf << " to " << objf; + } else { + Vector weights(WeightDim()); + GetWeights(params, &weights); + bool print_weights = true; + double penalty = GetSumToOnePenalty(weights, NULL, print_weights); + // note: initial_objf has no penalty term because it summed exactly + // to one. + KALDI_LOG << "Combining nnets, objective function changed from " + << initial_objf << " to " << objf << " = " + << (objf - penalty) << " + " << penalty; + } + // must recompute nnet_ if "params" is not exactly equal to the // final params that LB - Vector final_params(dim); + Vector final_params(dim); final_params.CopyFromVec(lbfgs.GetValue(&objf)); if (!params.ApproxEqual(final_params, 0.0)) { // the following call makes sure that nnet_ corresponds to the parameters @@ -165,9 +184,8 @@ void NnetChainCombiner::Combine() { } -void NnetChainCombiner::PrintParams(const VectorBase ¶ms) const { - - Vector weights(params.Dim()), normalized_weights(params.Dim()); +void NnetChainCombiner::PrintParams(const VectorBase ¶ms) const { + Vector weights(params.Dim()), normalized_weights(params.Dim()); GetWeights(params, &weights); GetNormalizedWeights(weights, &normalized_weights); int32 num_models = nnet_params_.NumRows(), @@ -217,21 +235,21 @@ void NnetChainCombiner::PrintParams(const VectorBase ¶ms) const { bool NnetChainCombiner::SelfTestDerivatives() { int32 num_tests = 2; // more properly, this is the number of dimensions in a // single test. - BaseFloat delta = 0.001; + double delta = 0.001; int32 dim = ParameterDim(); - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), + Vector params(dim), deriv(dim); + Vector predicted_changes(num_tests), observed_changes(num_tests); GetInitialParameters(¶ms); - BaseFloat initial_objf = ComputeObjfAndDerivFromParameters(params, + double initial_objf = ComputeObjfAndDerivFromParameters(params, &deriv); for (int32 i = 0; i < num_tests; i++) { - Vector new_deriv(dim), offset(dim), new_params(params); + Vector new_deriv(dim), offset(dim), new_params(params); offset.SetRandn(); new_params.AddVec(delta, offset); - BaseFloat new_objf = ComputeObjfAndDerivFromParameters(new_params, + double new_objf = ComputeObjfAndDerivFromParameters(new_params, &new_deriv); // for predicted changes, interpolate old and new derivs. predicted_changes(i) = @@ -239,7 +257,7 @@ bool NnetChainCombiner::SelfTestDerivatives() { 0.5 * VecVec(new_params, new_deriv) - 0.5 * VecVec(params, new_deriv); observed_changes(i) = new_objf - initial_objf; } - BaseFloat threshold = 0.1; + double threshold = 0.1; KALDI_LOG << "predicted_changes = " << predicted_changes; KALDI_LOG << "observed_changes = " << observed_changes; if (!ApproxEqual(predicted_changes, observed_changes, threshold)) { @@ -256,23 +274,23 @@ void NnetChainCombiner::SelfTestModelDerivatives() { // single test. 
int32 dim = ParameterDim(); - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), + Vector params(dim), deriv(dim); + Vector predicted_changes(num_tests), observed_changes(num_tests); GetInitialParameters(¶ms); - Vector weights(WeightDim()), normalized_weights(WeightDim()), - nnet_params(NnetParameterDim(), kUndefined), + Vector weights(WeightDim()), normalized_weights(WeightDim()); + Vector nnet_params(NnetParameterDim(), kUndefined), nnet_deriv(NnetParameterDim(), kUndefined); GetWeights(params, &weights); GetNormalizedWeights(weights, &normalized_weights); GetNnetParameters(normalized_weights, &nnet_params); - BaseFloat initial_objf = ComputeObjfAndDerivFromNnet(nnet_params, + double initial_objf = ComputeObjfAndDerivFromNnet(nnet_params, &nnet_deriv); - BaseFloat delta = 0.002 * std::sqrt(VecVec(nnet_params, nnet_params) / - NnetParameterDim()); + double delta = 0.002 * std::sqrt(VecVec(nnet_params, nnet_params) / + NnetParameterDim()); for (int32 i = 0; i < num_tests; i++) { @@ -280,7 +298,7 @@ void NnetChainCombiner::SelfTestModelDerivatives() { offset(NnetParameterDim()), new_nnet_params(nnet_params); offset.SetRandn(); new_nnet_params.AddVec(delta, offset); - BaseFloat new_objf = ComputeObjfAndDerivFromNnet(new_nnet_params, + double new_objf = ComputeObjfAndDerivFromNnet(new_nnet_params, &new_nnet_deriv); // for predicted changes, interpolate old and new derivs. predicted_changes(i) = @@ -290,7 +308,7 @@ void NnetChainCombiner::SelfTestModelDerivatives() { 0.5 * VecVec(nnet_params, new_nnet_deriv); observed_changes(i) = new_objf - initial_objf; } - BaseFloat threshold = 0.1; + double threshold = 0.1; KALDI_LOG << "model-derivatives: predicted_changes = " << predicted_changes; KALDI_LOG << "model-derivatives: observed_changes = " << observed_changes; if (!ApproxEqual(predicted_changes, observed_changes, threshold)) @@ -308,7 +326,7 @@ int32 NnetChainCombiner::ParameterDim() const { } -void NnetChainCombiner::GetInitialParameters(VectorBase *params) const { +void NnetChainCombiner::GetInitialParameters(VectorBase *params) const { KALDI_ASSERT(params->Dim() == ParameterDim()); params->Set(1.0 / nnet_params_.NumRows()); if (combine_config_.enforce_positive_weights) { @@ -318,8 +336,8 @@ void NnetChainCombiner::GetInitialParameters(VectorBase *params) cons } } -void NnetChainCombiner::GetWeights(const VectorBase ¶ms, - VectorBase *weights) const { +void NnetChainCombiner::GetWeights(const VectorBase ¶ms, + VectorBase *weights) const { KALDI_ASSERT(weights->Dim() == WeightDim()); if (combine_config_.separate_weights_per_component) { weights->CopyFromVec(params); @@ -339,12 +357,12 @@ void NnetChainCombiner::GetWeights(const VectorBase ¶ms, } -void NnetChainCombiner::GetParamsDeriv(const VectorBase &weights, - const VectorBase &weights_deriv, - VectorBase *param_deriv) { +void NnetChainCombiner::GetParamsDeriv(const VectorBase &weights, + const VectorBase &weights_deriv, + VectorBase *param_deriv) { KALDI_ASSERT(weights.Dim() == WeightDim() && param_deriv->Dim() == ParameterDim()); - Vector preexp_weights_deriv(weights_deriv); + Vector preexp_weights_deriv(weights_deriv); if (combine_config_.enforce_positive_weights) { // to enforce positive weights we first compute weights (call these // preexp_weights) and then take exponential. Note, d/dx exp(x) = exp(x). 
@@ -363,8 +381,54 @@ void NnetChainCombiner::GetParamsDeriv(const VectorBase &weights, } } +double NnetChainCombiner::GetSumToOnePenalty( + const VectorBase &weights, + VectorBase *weights_penalty_deriv, + bool print_weights) const { -void NnetChainCombiner::GetNnetParameters(const Vector &weights, + KALDI_ASSERT(combine_config_.sum_to_one_penalty >= 0.0); + double penalty = combine_config_.sum_to_one_penalty; + if (penalty == 0.0) { + weights_penalty_deriv->SetZero(); + return 0.0; + } + double ans = 0.0; + int32 num_uc = NumUpdatableComponents(), + num_models = nnet_params_.NumRows(); + Vector tot_weights(num_uc); + std::ostringstream tot_weight_info; + for (int32 c = 0; c < num_uc; c++) { + double this_total_weight = 0.0; + for (int32 m = 0; m < num_models; m++) { + int32 index = m * num_uc + c; + double this_weight = weights(index); + this_total_weight += this_weight; + } + tot_weights(c) = this_total_weight; + ans += -0.5 * penalty * + (this_total_weight - 1.0) * (this_total_weight - 1.0); + if (weights_penalty_deriv != NULL) { + KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); + // this_total_weight_deriv is the derivative of the penalty + // term w.r.t. this component's total weight. + double this_total_weight_deriv = + penalty * (1.0 - this_total_weight); + for (int32 m = 0; m < num_models; m++) { + int32 index = m * num_uc + c; + (*weights_penalty_deriv)(index) = this_total_weight_deriv; + } + } + } + if (print_weights) { + Vector tot_weights_float(tot_weights); + KALDI_LOG << "Total weights per component: " + << PrintVectorPerUpdatableComponent(nnet_, + tot_weights_float); + } + return ans; +} + +void NnetChainCombiner::GetNnetParameters(const Vector &weights, VectorBase *nnet_params) const { KALDI_ASSERT(nnet_params->Dim() == nnet_params_.NumCols()); nnet_params->SetZero(); @@ -390,7 +454,7 @@ void NnetChainCombiner::GetNnetParameters(const Vector &weights, // compare GetNnetParameters. 
void NnetChainCombiner::GetWeightsDeriv( const VectorBase &nnet_params_deriv, - VectorBase *weights_deriv) { + VectorBase *weights_deriv) { KALDI_ASSERT(nnet_params_deriv.Dim() == nnet_params_.NumCols() && weights_deriv->Dim() == WeightDim()); int32 num_uc = NumUpdatableComponents(), @@ -442,30 +506,35 @@ double NnetChainCombiner::ComputeObjfAndDerivFromNnet( double NnetChainCombiner::ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv) { - Vector weights(WeightDim()), normalized_weights(WeightDim()), - nnet_params(NnetParameterDim(), kUndefined), - nnet_params_deriv(NnetParameterDim(), kUndefined), + VectorBase ¶ms, + VectorBase *params_deriv) { + Vector weights(WeightDim()), normalized_weights(WeightDim()), + weights_sum_to_one_penalty_deriv(WeightDim()), normalized_weights_deriv(WeightDim()), weights_deriv(WeightDim()); + Vector + nnet_params(NnetParameterDim(), kUndefined), + nnet_params_deriv(NnetParameterDim(), kUndefined); GetWeights(params, &weights); + double ans = GetSumToOnePenalty(weights, &weights_sum_to_one_penalty_deriv); GetNormalizedWeights(weights, &normalized_weights); GetNnetParameters(normalized_weights, &nnet_params); - double ans = ComputeObjfAndDerivFromNnet(nnet_params, &nnet_params_deriv); + ans += ComputeObjfAndDerivFromNnet(nnet_params, &nnet_params_deriv); if (ans != ans || ans - ans != 0) // NaN or inf return ans; // No point computing derivative GetWeightsDeriv(nnet_params_deriv, &normalized_weights_deriv); GetUnnormalizedWeightsDeriv(weights, normalized_weights_deriv, &weights_deriv); + weights_deriv.AddVec(1.0, weights_sum_to_one_penalty_deriv); GetParamsDeriv(weights, weights_deriv, params_deriv); return ans; } -// enforces the constraint that the weights for each component must sum to one. +// enforces the constraint that the weights for each component must sum to one, +// if necessary. void NnetChainCombiner::GetNormalizedWeights( - const VectorBase &unnorm_weights, - VectorBase *norm_weights) const { + const VectorBase &unnorm_weights, + VectorBase *norm_weights) const { if (!combine_config_.enforce_sum_to_one) { norm_weights->CopyFromVec(unnorm_weights); return; @@ -473,12 +542,12 @@ void NnetChainCombiner::GetNormalizedWeights( int32 num_uc = NumUpdatableComponents(), num_models = nnet_params_.NumRows(); for (int32 c = 0; c < num_uc; c++) { - BaseFloat sum = 0.0; + double sum = 0.0; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; sum += unnorm_weights(index); } - BaseFloat inv_sum = 1.0 / sum; // if it's NaN then it's OK, we'll get NaN + double inv_sum = 1.0 / sum; // if it's NaN then it's OK, we'll get NaN // weights and eventually -inf objective. 
for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; @@ -488,9 +557,9 @@ void NnetChainCombiner::GetNormalizedWeights( } void NnetChainCombiner::GetUnnormalizedWeightsDeriv( - const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv) { + const VectorBase &unnorm_weights, + const VectorBase &norm_weights_deriv, + VectorBase *unnorm_weights_deriv) { if (!combine_config_.enforce_sum_to_one) { unnorm_weights_deriv->CopyFromVec(norm_weights_deriv); return; @@ -498,13 +567,13 @@ void NnetChainCombiner::GetUnnormalizedWeightsDeriv( int32 num_uc = NumUpdatableComponents(), num_models = nnet_params_.NumRows(); for (int32 c = 0; c < num_uc; c++) { - BaseFloat sum = 0.0; + double sum = 0.0; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; sum += unnorm_weights(index); } - BaseFloat inv_sum = 1.0 / sum; - BaseFloat inv_sum_deriv = 0.0; + double inv_sum = 1.0 / sum; + double inv_sum_deriv = 0.0; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; // in the forward direction, we'd do: @@ -513,7 +582,7 @@ void NnetChainCombiner::GetUnnormalizedWeightsDeriv( inv_sum_deriv += norm_weights_deriv(index) * unnorm_weights(index); } // note: d/dx (1/x) = -1/x^2 - BaseFloat sum_deriv = -1.0 * inv_sum_deriv * inv_sum * inv_sum; + double sum_deriv = -1.0 * inv_sum_deriv * inv_sum * inv_sum; for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; (*unnorm_weights_deriv)(index) += sum_deriv; diff --git a/src/nnet3/nnet-chain-combine.h b/src/nnet3/nnet-chain-combine.h index 6ef882ecc38..3aeb3882650 100644 --- a/src/nnet3/nnet-chain-combine.h +++ b/src/nnet3/nnet-chain-combine.h @@ -62,7 +62,7 @@ class NnetChainCombiner { ~NnetChainCombiner() { delete prob_computer_; } private: - const NnetCombineConfig &combine_config_; + NnetCombineConfig combine_config_; const chain::ChainTrainingOptions &chain_config_; const std::vector &egs_; @@ -87,8 +87,9 @@ class NnetChainCombiner { Matrix nnet_params_; // This vector has the same dimension as nnet_params_.NumRows(), - // and helps us normalize so each row of nnet_params correspondss to - // a weighted average of its inputs. + // and helps us normalize so each row of nnet_params corresponds to + // a weighted average of its inputs (will be all ones if + // config_.max_effective_inputs >= the number of nnets provided). Vector tot_input_weighting_; // returns the parameter dimension, i.e. the dimension of the parameters that @@ -110,7 +111,7 @@ class NnetChainCombiner { // Computes the initial parameters. The parameters are the underlying thing // that we optimize; their dimension equals ParameterDim(). They are not the same // thing as the nnet parameters. - void GetInitialParameters(VectorBase *params) const; + void GetInitialParameters(VectorBase *params) const; // Tests that derivatives are accurate. Prints warning and returns false if not. bool SelfTestDerivatives(); @@ -120,33 +121,48 @@ class NnetChainCombiner { // prints the parameters via logging statements. - void PrintParams(const VectorBase ¶ms) const; + void PrintParams(const VectorBase ¶ms) const; // This function computes the objective function (and its derivative, if the objective // function is finite) at the given value of the parameters (the parameters we're optimizing, // i.e. the combination weights; not the nnet parameters. This function calls most of the // functions below. 
double ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv); + VectorBase ¶ms, + VectorBase *params_deriv); // Computes the weights from the parameters in a config-dependent way. The // weight dimension is always (the number of updatable components times // nnet_params_.NumRows()). - void GetWeights(const VectorBase ¶ms, - VectorBase *weights) const; + void GetWeights(const VectorBase ¶ms, + VectorBase *weights) const; // Given the raw weights: if config_.enforce_sum_to_one, then compute weights // with sum-to-one constrint per component included; else just copy input to // output. - void GetNormalizedWeights(const VectorBase &unnorm_weights, - VectorBase *norm_weights) const; + void GetNormalizedWeights(const VectorBase &unnorm_weights, + VectorBase *norm_weights) const; + + // if config_.sum_to_one_penalty is 0.0, returns 0.0 and sets + // weights_penalty_deriv to 0.0; else it computes, for each + // updatable component u the total weight w_u, returns the value + // -0.5 * config_.sum_to_one_penalty * sum_u (w_u - 1.0)^2; + // and sets 'weights_penalty_deriv' to the derivative w.r.t. + // the result. + // Note: config_.sum_to_one_penalty is exclusive with + // config_.enforce_sum_to_one, so there is really no distinction between + // normalized and unnormalized weights here (since normalization would be a + // no-op). + double GetSumToOnePenalty(const VectorBase &weights, + VectorBase *weights_penalty_deriv, + bool print_weights = false) const; + // Computes the nnet-parameter vector from the normalized weights and // nnet_params_, as a vector. (See the functions Vectorize() and // UnVectorize() for how they relate to the nnet's components' parameters). - void GetNnetParameters(const Vector &normalized_weights, + void GetNnetParameters(const Vector &normalized_weights, VectorBase *nnet_params) const; // This function computes the objective function (and its derivative, if the objective @@ -158,23 +174,23 @@ class NnetChainCombiner { // Given an objective-function derivative with respect to the nnet parameters, // computes the derivative with respect to the (normalized) weights. void GetWeightsDeriv(const VectorBase &nnet_params_deriv, - VectorBase *normalized_weights_deriv); + VectorBase *normalized_weights_deriv); // Computes the derivative w.r.t. the unnormalized weights, by propagating // through the normalization operation. // If config_.enforce_sum_to_one == false, just copies norm_weights_deriv to // unnorm_weights_deriv. - void GetUnnormalizedWeightsDeriv(const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv); + void GetUnnormalizedWeightsDeriv(const VectorBase &unnorm_weights, + const VectorBase &norm_weights_deriv, + VectorBase *unnorm_weights_deriv); // Given a derivative w.r.t. the weights, outputs a derivative w.r.t. 
// the params - void GetParamsDeriv(const VectorBase &weights, - const VectorBase &weight_deriv, - VectorBase *param_deriv); + void GetParamsDeriv(const VectorBase &weights, + const VectorBase &weight_deriv, + VectorBase *param_deriv); void ComputeUpdatableComponentDims(); void FinishPreprocessingInput(); From b9adca346961249ef77d66321a891a3009f164f7 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 24 Jan 2017 23:00:18 +0800 Subject: [PATCH 128/213] python level implementation --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 270 ++++++++++++++------ 1 file changed, 193 insertions(+), 77 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 5e928a0f7c3..6f0e1e0f1c6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -56,9 +56,6 @@ def check_configs(self): if self.config['cell-dim'] <= 0: raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) - if self.config['delay'] == 0: - raise RuntimeError("delay cannot be zero") - for key in ['self-repair-scale-nonlinearity']: if self.config[key] < 0.0 or self.config[key] > 1.0: raise RuntimeError("{0} has invalid value {1}.".format(key, self.config[key])) @@ -251,7 +248,8 @@ def set_default_configs(self): 'self-repair-scale-nonlinearity' : 0.00001, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0 # -1.0 stands for no dropout will be added + 'dropout-proportion' : -1.0, # -1.0 stands for no dropout will be added + 'dropout-per-frame' : 'false' # default normal dropout mode } def set_derived_configs(self): @@ -272,9 +270,6 @@ def check_configs(self): raise RuntimeError("{0} has invalid value {1}.".format( key, self.config[key])) - if self.config['delay'] == 0: - raise RuntimeError("delay cannot be zero") - if (self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] > self.config['cell-dim']): @@ -292,6 +287,10 @@ def check_configs(self): raise RuntimeError("dropout-proportion has invalid value {0}." 
"".format(self.config['dropout-proportion'])) + if (self.config['dropout-per-frame'] != 'false' and + self.config['dropout-per-frame'] != 'true'): + raise xparser_error("dropout-per-frame has invalid value {0}.".format(self.config['dropout-per-frame'])) + def auxiliary_outputs(self): return ['c_t'] @@ -353,6 +352,8 @@ def generate_lstm_config(self): pes_str = self.config['ng-per-element-scale-options'] lstm_dropout_value = self.config['dropout-proportion'] lstm_dropout_str = 'dropout-proportion='+str(self.config['dropout-proportion']) + lstm_dropout_per_frame_value = self.config['dropout-per-frame'] + lstm_dropout_per_frame_str = 'dropout-per-frame='+str(self.config['dropout-per-frame']) # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options @@ -389,6 +390,8 @@ def generate_lstm_config(self): configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + if lstm_dropout_value != -1.0: + configs.append("component name={0}.dropout type=DropoutComponent dim={1} {2} {3}".format(name, cell_dim, lstm_dropout_str, lstm_dropout_per_frame_str)) configs.append("# Defining the components for other cell computations") configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) @@ -404,17 +407,29 @@ def generate_lstm_config(self): configs.append("# i_t") configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) - configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + if lstm_dropout_value != -1.0: + configs.append("component-node name={0}.i_t_predrop component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + configs.append("component-node name={0}.i_t component={0}.dropout input={0}.i_t_predrop".format(name)) + else: + configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) configs.append("# f_t") configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) - configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + if lstm_dropout_value != -1.0: + configs.append("component-node name={0}.f_t_predrop component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + configs.append("component-node name={0}.f_t component={0}.dropout input={0}.f_t_predrop".format(name)) + else: + configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) configs.append("# o_t") configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) - configs.append("component-node 
name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + if lstm_dropout_value != -1.0: + configs.append("component-node name={0}.o_t_predrop component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format(name)) + else: + configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) configs.append("# h_t") configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) @@ -432,21 +447,134 @@ def generate_lstm_config(self): # add the recurrent connections configs.append("# projection matrices : Wrm and Wpm") - if lstm_dropout_value != -1.0: - configs.append("component name={0}.W_rp.m.dropout type=DropoutComponent dim={1} {2}".format(name, cell_dim, lstm_dropout_str)) configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) configs.append("# r_t and p_t : rp_t will be the output") - if lstm_dropout_value != -1.0: - configs.append("component-node name={0}.rp_t.dropout component={0}.W_rp.m.dropout input={0}.m_t".format(name)) - configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.rp_t.dropout".format(name)) - configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) - else: - configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) - configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + + return configs + +# Same as the LSTMP layer except that the matrix multiplications are combined +# we probably keep only version after experimentation. 
One year old experiments +# show that this version is slightly worse and might require some tuning +class XconfigLstmpcLayer(XconfigLstmpLayer): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "lstmpc-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', ng_per_element_scale_options) is None and \ + re.search('param-stddev', ng_per_element_scale_options) is None: + ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + pes_str = ng_per_element_scale_options + + configs = [] + # naming convention + # .W_. e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("### Begin LTSM layer '{0}'".format(name)) + configs.append("# Full W_ifoc* matrix") + configs.append("component name={0}.W_ifoc.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, 4*cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + + # we will not combine the diagonal matrix operations as one of these has a different delay + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 
type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + rec_connection = '{0}.rp_t'.format(name) + + component_nodes.append("component-node name={0}.ifoc_t component={0}.W_ifoc.xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + + + offset = 0 + component_nodes.append("# i_t") + component_nodes.append("dim-range-node name={0}.i1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.i2_t component={0}.w_i.cinput={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + component_nodes.append("# f_t") + component_nodes.append("dim-range-node name={0}.f1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + component_nodes.append("# o_t") + component_nodes.append("dim-range-node name={0}.o1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + component_nodes.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + component_nodes.append("# h_t") + component_nodes.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + component_nodes.append("# g_t") + component_nodes.append("dim-range-node name={0}.g1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("# projection matrices : Wrm and Wpm") + configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + + configs.append("# r_t and p_t : rp_t will be the output") + 
configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + configs.append("### End LTSM layer '{0}'".format(name)) return configs @@ -473,15 +601,6 @@ def generate_lstm_config(self): # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] # ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] -# decay-time=-1 [If >0, an approximate maximum on how many frames -# can be remembered via summation into the cell -# contents c_t; enforced by putting a scaling factor -# of recurrence_scale = 1 - abs(delay)/decay_time on -# the recurrence, i.e. the term c_{t-1} in the LSTM -# equations. E.g. setting this to 20 means no more -# than about 20 frames' worth of history, -# i.e. history since about t = t-20, can be -# accumulated in c_t.] class XconfigFastLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "fast-lstm-layer" @@ -501,8 +620,7 @@ def set_default_configs(self): 'lstm-nonlinearity-options' : ' max-change=0.75', # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. - 'ng-affine-options' : ' max-change=1.5', - 'decay-time': -1.0 + 'ng-affine-options' : ' max-change=1.5' } self.c_needed = False # keep track of whether the 'c' output is needed. @@ -514,8 +632,6 @@ def check_configs(self): key = 'cell-dim' if self.config['cell-dim'] <= 0: raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) - if self.config['delay'] == 0: - raise RuntimeError("delay cannot be zero") @@ -563,23 +679,17 @@ def generate_lstm_config(self): input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] delay = self.config['delay'] - affine_str = self.config['ng-affine-options'] - decay_time = self.config['decay-time'] - # we expect decay_time to be either -1, or large, like 10 or 50. - recurrence_scale = (1.0 if decay_time < 0 else - 1.0 - (abs(delay) / decay_time)) - assert recurrence_scale > 0 # or user may have set decay-time much - # too small. - lstm_str = self.config['lstm-nonlinearity-options'] bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" " recurrence-interval={3}" - " scale={4}" "".format(self.config['clipping-threshold'], self.config['zeroing-threshold'], self.config['zeroing-interval'], - abs(delay), recurrence_scale)) + abs(delay))) + affine_str = self.config['ng-affine-options'] + lstm_str = self.config['lstm-nonlinearity-options'] + configs = [] @@ -642,15 +752,6 @@ def generate_lstm_config(self): # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] # ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] -# decay-time=-1 [If >0, an approximate maximum on how many frames -# can be remembered via summation into the cell -# contents c_t; enforced by putting a scaling factor -# of recurrence_scale = 1 - abs(delay)/decay_time on -# the recurrence, i.e. the term c_{t-1} in the LSTM -# equations. E.g. 
setting this to 20 means no more -# than about 20 frames' worth of history, -# i.e. history since about t = t-20, can be -# accumulated in c_t.] class XconfigFastLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "fast-lstmp-layer" @@ -671,10 +772,10 @@ def set_default_configs(self): # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', - 'decay-time': -1.0, 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0 - + 'zeroing-threshold' : 15.0, + 'dropout-proportion' : -1.0 ,# -1.0 stands for no dropout will be added + 'dropout-per-frame' : 'false' # default normal dropout mode } def set_derived_configs(self): @@ -688,14 +789,22 @@ def set_derived_configs(self): self.config['non-recurrent-projection-dim'] = \ self.config['recurrent-projection-dim'] + if ((self.config['dropout-proportion'] > 1.0 or + self.config['dropout-proportion'] < 0.0) and + self.config['dropout-proportion'] != -1.0 ): + raise xparser_error("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion'])) + + if (self.config['dropout-per-frame'] != 'false' and + self.config['dropout-per-frame'] != 'true'): + raise xparser_error("dropout-per-frame has invalid value {0}.".format(self.config['dropout-per-frame'])) + def check_configs(self): for key in ['cell-dim', 'recurrent-projection-dim', 'non-recurrent-projection-dim']: if self.config[key] <= 0: raise RuntimeError("{0} has invalid value {1}.".format( key, self.config[key])) - if self.config['delay'] == 0: - raise RuntimeError("delay cannot be zero") + if (self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] > self.config['cell-dim']): @@ -747,28 +856,23 @@ def generate_lstm_config(self): input_dim = self.descriptors['input']['dim'] input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] - delay = self.config['delay'] rec_proj_dim = self.config['recurrent-projection-dim'] nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - affine_str = self.config['ng-affine-options'] - decay_time = self.config['decay-time'] - # we expect decay_time to be either -1, or large, like 10 or 50. - recurrence_scale = (1.0 if decay_time < 0 else - 1.0 - (abs(delay) / decay_time)) - assert recurrence_scale > 0 # or user may have set decay-time much - # too small. 
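For reference, the decay-time mechanism removed here scaled the c_{t-1} recurrence as sketched below (a restatement of the deleted lines, not new behavior); note that the tedlium tuning scripts added later in this series still pass decay-time=20 to the fast-lstmp layers.

  def recurrence_scale(delay, decay_time):
      # decay_time < 0 (the default -1) disables the decay; otherwise the
      # c_{t-1} term is scaled by 1 - |delay| / decay_time, e.g. delay=-3,
      # decay_time=20 gives 0.85, limiting the usable history to roughly
      # decay_time frames.
      if decay_time < 0:
          return 1.0
      scale = 1.0 - abs(delay) / float(decay_time)
      assert scale > 0  # otherwise decay-time was set much too small
      return scale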
- + delay = self.config['delay'] bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" " recurrence-interval={3}" - " scale={4}" "".format(self.config['clipping-threshold'], self.config['zeroing-threshold'], self.config['zeroing-interval'], - abs(delay), recurrence_scale)) - + abs(delay))) + affine_str = self.config['ng-affine-options'] lstm_str = self.config['lstm-nonlinearity-options'] + lstm_dropout_value = self.config['dropout-proportion'] + lstm_dropout_str = 'dropout-proportion='+str(self.config['dropout-proportion']) + lstm_dropout_per_frame_value = self.config['dropout-per-frame'] + lstm_dropout_per_frame_str = 'dropout-per-frame='+str(self.config['dropout-per-frame']) configs = [] @@ -787,6 +891,8 @@ def generate_lstm_config(self): configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent " "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str)) + if lstm_dropout_value != -1.0: + configs.append("component name={0}.cr_trunc.dropout type=DropoutComponent dim={1} {2} {3}".format(name, cell_dim + rec_proj_dim, lstm_dropout_str, lstm_dropout_per_frame_str)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent projections") configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " @@ -808,11 +914,21 @@ def generate_lstm_config(self): configs.append("# Note: it's not 100% efficient that we have to stitch the c") configs.append("# and r back together to truncate them but it probably"); configs.append("# makes the deriv truncation more accurate .") - configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " - "input=Append({0}.c, {0}.r)".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " - "dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " - "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) - configs.append("### End LSTM Layer '{0}'".format(name)) + if lstm_dropout_value != -1.0: + configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " + "input=Append({0}.c, {0}.r)".format(name)) + configs.append("component-node name={0}.cr_trunc.dropout component={0}.cr_trunc.dropout input={0}.cr_trunc".format(name)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc.dropout " + "dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc.dropout " + "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + configs.append("### End LSTM Layer '{0}'".format(name)) + else: + configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " + "input=Append({0}.c, {0}.r)".format(name)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " + "dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " + "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + configs.append("### End LSTM Layer '{0}'".format(name)) return configs From f2d999b8416f38e2a062dea3b6d70849630ecf48 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 26 Jan 2017 21:36:03 -0500 Subject: [PATCH 129/213] [egs]: fix some soft links --- egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh | 2 +- 
egs/tedlium/s5_r2/local/chain/run_tdnn_lstm_disc.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 120000 egs/tedlium/s5_r2/local/chain/run_tdnn_lstm_disc.sh diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh index 8e647598556..fbc28248491 120000 --- a/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh +++ b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh @@ -1 +1 @@ -tuning/run_tdnn_lstm_1a.sh \ No newline at end of file +tuning/run_tdnn_lstm_1e.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm_disc.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm_disc.sh new file mode 120000 index 00000000000..d4268b4185a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm_disc.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1e_disc.sh \ No newline at end of file From 9333bcdac4e09063ed71b25d5304af08f75a78a3 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 27 Jan 2017 21:09:47 -0500 Subject: [PATCH 130/213] [src,egs,scripts]: improve use of sum-to-one penalty in combination, provide script support; examples of use of dropout in TDNN+LSTMs; change minibatch-size in combination phase. --- .../s5_r2/local/chain/compare_wer_general.sh | 8 +- .../local/chain/tuning/run_tdnn_lstm_1e.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1k.sh | 339 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1l.sh | 330 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1m.sh | 330 +++++++++++++++++ .../nnet3/train/chain_objf/acoustic_model.py | 12 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 + .../libs/nnet3/train/dropout_schedule.py | 16 +- .../nnet3/train/frame_level_objf/common.py | 20 +- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 253 +++++-------- egs/wsj/s5/steps/nnet3/chain/train.py | 4 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 5 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 5 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 4 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 4 +- src/nnet3/nnet-chain-combine.cc | 25 +- src/nnet3/nnet-combine.cc | 25 +- 17 files changed, 1174 insertions(+), 213 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh diff --git a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh index 21ba1720e3a..d3acae200b8 100755 --- a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh +++ b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh @@ -77,28 +77,28 @@ if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
fi -echo -n "Final train prob " +echo -n "# Final train prob " for x in $*; do prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "Final valid prob " +echo -n "# Final valid prob " for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "Final train prob (xent)" +echo -n "# Final train prob (xent)" for x in $*; do prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "Final valid prob (xent)" +echo -n "# Final valid prob (xent)" for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh index 5bfdc68fa3f..7f0b9588b66 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -73,7 +73,7 @@ frames_per_chunk_primary=140 train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_lstm_affix=1e #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. +common_egs_dir= # you can set this to use previously dumped egs. # End configuration section. echo "$0 $@" # Print the command line for logging diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh new file mode 100755 index 00000000000..ab9d6ce6342 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# 1k is as 1e, but introducing a dropout schedule. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.0 +# [looped:] 9.0 8.6 8.9 8.9 +# WER on dev(rescored) 8.4 7.9 8.2 8.2 +# [looped:] 8.4 7.8 8.2 8.3 +# WER on test(orig) 8.8 8.8 8.9 8.9 +# [looped:] 8.8 8.7 8.8 8.8 +# WER on test(rescored) 8.4 8.3 8.2 8.5 +# [looped:] 8.3 8.3 8.3 8.4 +# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 +# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 +# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 +# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 + + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1k #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. + lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.7@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh new file mode 100755 index 00000000000..e09df86558a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -0,0 +1,330 @@ +#!/bin/bash + + +# 1l is as 1k, but having the dropout end at the end of training, not @0.75. + +# see run_tdnn_lstm_1k.sh for results. + + +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1l #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
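  # (With these options the fast-lstmp-layer code changed earlier in this series
  #  adds a DropoutComponent of dim cell-dim + recurrent-projection-dim, here
  #  512 + 128 = 640, applied to the appended (c, r) stream right after the
  #  BackpropTruncationComponent; the proportion itself is then driven by the
  #  --trainer.dropout-schedule option passed to train.py below.)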
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.7@0.5,0' \ + --trainer.optimization.combine-sum-to-one-penalty=0.001 \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh new file mode 100755 index 00000000000..3e75c9fe3e0 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -0,0 +1,330 @@ +#!/bin/bash + + +# 1m is as 1l, but having the dropout end at 0.1 +# see run_tdnn_lstm_1k.sh for results. + +# 1l is as 1k, but having the dropout end at the end of training. + +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1m #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
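+  # For example, the schedule '0,0@0.20,0.7@0.5,0.1' passed to
+  # steps/nnet3/chain/train.py below keeps dropout at 0 for the first 20% of
+  # training, raises it linearly to a peak of 0.7 at the half-way point, and
+  # then lowers it linearly to 0.1 by the end of training.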
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.7@0.5,0.1' \ + --trainer.optimization.combine-sum-to-one-penalty=0.001 \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 69eb0f52e3b..b90349a6fac 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -515,7 +515,9 @@ def compute_progress(dir, iter, run_opts, wait=False, def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, left_context, right_context, leaky_hmm_coefficient, l2_regularize, - xent_regularize, run_opts, background_process_handler=None): + xent_regularize, run_opts, + background_process_handler=None, + sum_to_one_penalty=0.0): """ Function to do model combination In the nnet3 setup, the logic @@ -539,9 +541,11 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st common_lib.run_job( """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-chain-combine --num-iters=40 \ + nnet3-chain-combine --num-iters=80 \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - --enforce-sum-to-one=true --enforce-positive-weights=true \ + --enforce-sum-to-one={hard_enforce} \ + --sum-to-one-penalty={penalty} \ + --enforce-positive-weights=true \ --verbose=3 {dir}/den.fst {raw_models} \ "ark,bg:nnet3-chain-copy-egs --left-context={lc} \ --right-context={rc} ark:{egs_dir}/combine.cegs ark:- | \ @@ -554,6 +558,8 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st lc=left_context, rc=right_context, l2=l2_regularize, leaky=leaky_hmm_coefficient, dir=dir, raw_models=" ".join(raw_model_strings), + hard_enforce=(sum_to_one_penalty <= 0), + penalty=sum_to_one_penalty, num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, egs_dir=egs_dir)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index af8e9793f0a..50bdc780a20 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -717,6 +717,11 @@ def __init__(self, the final model combination stage. These models will themselves be averages of iteration-number ranges""") + self.parser.add_argument("--trainer.optimization.combine-sum-to-one-penalty", + type=float, dest='combine_sum_to_one_penalty', default=0.0, + help="""If > 0, activates 'soft' enforcement of the + sum-to-one penalty in combination (may be helpful + if using dropout). E.g. 1.0e-03.""") self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.0, help="""Momentum used in update computation. 
diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index d9cf3112e4a..0ad93e5977d 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -13,6 +13,8 @@ logger.addHandler(logging.NullHandler()) +_debug_dropout = False + def _parse_dropout_option(dropout_option): """Parses the string option to --trainer.dropout-schedule and returns a list of dropout schedules for different component name patterns. @@ -53,11 +55,12 @@ def _parse_dropout_option(dropout_option): this_dropout_values = _parse_dropout_string(this_dropout_str) dropout_schedule.append((component_name, this_dropout_values)) - logger.info("Dropout schedules for component names is as follows:") - logger.info(": [(num_archives_processed), " - "(dropout_proportion) ...]") - for name, schedule in dropout_schedule: - logger.info("{0}: {1}".format(name, schedule)) + if _debug_dropout: + logger.info("Dropout schedules for component names is as follows:") + logger.info(": [(num_archives_processed), " + "(dropout_proportion) ...]") + for name, schedule in dropout_schedule: + logger.info("{0}: {1}".format(name, schedule)) return dropout_schedule @@ -236,7 +239,8 @@ def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): dropout_info.append("pattern/dropout-proportion={0}/{1}".format( component_name, dropout_proportion)) - logger.info("On iteration %d, %s", iter_, ', '.join(dropout_info)) + if _debug_dropout: + logger.info("On iteration %d, %s", iter_, ', '.join(dropout_info)) return ("""nnet3-copy --edits='{edits}' - - |""".format( edits=";".join(edit_config_lines))) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 377a0575266..f3955772945 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -437,8 +437,10 @@ def compute_progress(dir, iter, egs_dir, left_context, right_context, def combine_models(dir, num_iters, models_to_combine, egs_dir, left_context, right_context, + minibatch_size_str, run_opts, background_process_handler=None, - chunk_width=None, get_raw_nnet_from_am=True): + chunk_width=None, get_raw_nnet_from_am=True, + sum_to_one_penalty=0.0): """ Function to do model combination In the nnet3 setup, the logic @@ -464,12 +466,6 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, raise Exception('Model file {0} missing'.format(model_file)) raw_model_strings.append(model_file) - if chunk_width is not None: - # this is an RNN model - mbsize = int(1024.0/(common_train_lib.principal_chunk_width(chunk_width))) - else: - mbsize = 1024 - if get_raw_nnet_from_am: out_model = ("| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl " "{dir}/combined.mdl".format(dir=dir, num_iters=num_iters)) @@ -481,8 +477,10 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, common_lib.run_job( """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-combine --num-iters=40 \ - --enforce-sum-to-one=true --enforce-positive-weights=true \ + nnet3-combine --num-iters=80 \ + --enforce-sum-to-one={hard_enforce} \ + --sum-to-one-penalty={penalty} \ + --enforce-positive-weights=true \ --verbose=3 {raw_models} \ "ark,bg:nnet3-copy-egs {context_opts} \ ark:{egs_dir}/combine.egs ark:- | \ @@ -492,8 +490,10 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, 
""".format(command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, dir=dir, raw_models=" ".join(raw_model_strings), + hard_enforce=(sum_to_one_penalty <= 0), + penalty=sum_to_one_penalty, context_opts=context_opts, - mbsize=mbsize, + mbsize=minibatch_size_str, out_model=out_model, egs_dir=egs_dir)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 6f0e1e0f1c6..9d7f649c4b4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -56,6 +56,9 @@ def check_configs(self): if self.config['cell-dim'] <= 0: raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + for key in ['self-repair-scale-nonlinearity']: if self.config[key] < 0.0 or self.config[key] > 1.0: raise RuntimeError("{0} has invalid value {1}.".format(key, self.config[key])) @@ -248,8 +251,8 @@ def set_default_configs(self): 'self-repair-scale-nonlinearity' : 0.00001, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0, # -1.0 stands for no dropout will be added - 'dropout-per-frame' : 'false' # default normal dropout mode + 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added + 'dropout-per-frame' : False # If false, regular dropout, not per frame. } def set_derived_configs(self): @@ -270,6 +273,9 @@ def check_configs(self): raise RuntimeError("{0} has invalid value {1}.".format( key, self.config[key])) + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") + if (self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] > self.config['cell-dim']): @@ -285,11 +291,7 @@ def check_configs(self): self.config['dropout-proportion'] < 0.0) and self.config['dropout-proportion'] != -1.0 ): raise RuntimeError("dropout-proportion has invalid value {0}." 
- "".format(self.config['dropout-proportion'])) - - if (self.config['dropout-per-frame'] != 'false' and - self.config['dropout-per-frame'] != 'true'): - raise xparser_error("dropout-per-frame has invalid value {0}.".format(self.config['dropout-per-frame'])) + .format(self.config['dropout-proportion'])) def auxiliary_outputs(self): return ['c_t'] @@ -350,10 +352,8 @@ def generate_lstm_config(self): abs(delay))) affine_str = self.config['ng-affine-options'] pes_str = self.config['ng-per-element-scale-options'] - lstm_dropout_value = self.config['dropout-proportion'] - lstm_dropout_str = 'dropout-proportion='+str(self.config['dropout-proportion']) - lstm_dropout_per_frame_value = self.config['dropout-per-frame'] - lstm_dropout_per_frame_str = 'dropout-per-frame='+str(self.config['dropout-per-frame']) + dropout_proportion = self.config['dropout-proportion'] + dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options @@ -390,9 +390,10 @@ def generate_lstm_config(self): configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - if lstm_dropout_value != -1.0: - configs.append("component name={0}.dropout type=DropoutComponent dim={1} {2} {3}".format(name, cell_dim, lstm_dropout_str, lstm_dropout_per_frame_str)) - + if dropout_proportion != -1.0: + configs.append("component name={0}.dropout type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}" + .format(name, cell_dim, dropout_proportion, dropout_per_frame)) configs.append("# Defining the components for other cell computations") configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) @@ -407,7 +408,7 @@ def generate_lstm_config(self): configs.append("# i_t") configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) - if lstm_dropout_value != -1.0: + if dropout_proportion != -1.0: configs.append("component-node name={0}.i_t_predrop component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) configs.append("component-node name={0}.i_t component={0}.dropout input={0}.i_t_predrop".format(name)) else: @@ -416,7 +417,7 @@ def generate_lstm_config(self): configs.append("# f_t") configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) - if lstm_dropout_value != -1.0: + if dropout_proportion != -1.0: configs.append("component-node name={0}.f_t_predrop component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) configs.append("component-node name={0}.f_t component={0}.dropout input={0}.f_t_predrop".format(name)) else: @@ -425,7 +426,7 @@ def 
generate_lstm_config(self): configs.append("# o_t") configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) - if lstm_dropout_value != -1.0: + if dropout_proportion != -1.0: configs.append("component-node name={0}.o_t_predrop component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format(name)) else: @@ -457,127 +458,6 @@ def generate_lstm_config(self): return configs -# Same as the LSTMP layer except that the matrix multiplications are combined -# we probably keep only version after experimentation. One year old experiments -# show that this version is slightly worse and might require some tuning -class XconfigLstmpcLayer(XconfigLstmpLayer): - def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "lstmpc-layer" - XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) - - # convenience function to generate the LSTM config - def generate_lstm_config(self): - # assign some variables to reduce verbosity - name = self.name - # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] - - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - # Natural gradient per element scale parameters - # TODO: decide if we want to keep exposing these options - if re.search('param-mean', ng_per_element_scale_options) is None and \ - re.search('param-stddev', ng_per_element_scale_options) is None: - ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " - pes_str = ng_per_element_scale_options - - configs = [] - # naming convention - # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] - configs.append("### Begin LTSM layer '{0}'".format(name)) - configs.append("# Full W_ifoc* matrix") - configs.append("component name={0}.W_ifoc.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, 4*cell_dim, affine_str)) - configs.append("# note : the cell outputs pass through a diagonal matrix") - - # we will not combine the diagonal matrix operations as one of these has a different delay - configs.append("# note : the cell outputs pass through a diagonal matrix") - configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) - configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) - configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) - - configs.append("# Defining the non-linearities") - configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - - configs.append("# Defining the components for other cell computations") - configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) - - # c1_t and c2_t defined below - configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) - delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) - rec_connection = '{0}.rp_t'.format(name) - - component_nodes.append("component-node name={0}.ifoc_t component={0}.W_ifoc.xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - - - offset = 0 - component_nodes.append("# i_t") - component_nodes.append("dim-range-node name={0}.i1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) - offset += cell_dim - component_nodes.append("component-node name={0}.i2_t component={0}.w_i.cinput={1}".format(name, delayed_c_t_descriptor)) - component_nodes.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) - - component_nodes.append("# f_t") - component_nodes.append("dim-range-node name={0}.f1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) - offset += cell_dim - component_nodes.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) - component_nodes.append("component-node 
name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) - - component_nodes.append("# o_t") - component_nodes.append("dim-range-node name={0}.o1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) - offset += cell_dim - component_nodes.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) - component_nodes.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) - - component_nodes.append("# h_t") - component_nodes.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) - - component_nodes.append("# g_t") - component_nodes.append("dim-range-node name={0}.g1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) - offset += cell_dim - component_nodes.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) - - - configs.append("# parts of c_t") - configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) - configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) - - configs.append("# m_t") - configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) - - # add the recurrent connections - configs.append("# projection matrices : Wrm and Wpm") - configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) - configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) - - configs.append("# r_t and p_t : rp_t will be the output") - configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) - configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) - configs.append("### End LTSM layer '{0}'".format(name)) - - return configs - # This class is for lines like # 'fast-lstm-layer name=lstm1 input=[-1] delay=-3' @@ -601,6 +481,15 @@ def generate_lstm_config(self): # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] # ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] +# decay-time=-1 [If >0, an approximate maximum on how many frames +# can be remembered via summation into the cell +# contents c_t; enforced by putting a scaling factor +# of recurrence_scale = 1 - abs(delay)/decay_time on +# the recurrence, i.e. the term c_{t-1} in the LSTM +# equations. E.g. setting this to 20 means no more +# than about 20 frames' worth of history, +# i.e. history since about t = t-20, can be +# accumulated in c_t.] class XconfigFastLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "fast-lstm-layer" @@ -620,7 +509,8 @@ def set_default_configs(self): 'lstm-nonlinearity-options' : ' max-change=0.75', # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 
- 'ng-affine-options' : ' max-change=1.5' + 'ng-affine-options' : ' max-change=1.5', + 'decay-time': -1.0 } self.c_needed = False # keep track of whether the 'c' output is needed. @@ -632,6 +522,8 @@ def check_configs(self): key = 'cell-dim' if self.config['cell-dim'] <= 0: raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") @@ -679,17 +571,23 @@ def generate_lstm_config(self): input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] delay = self.config['delay'] + affine_str = self.config['ng-affine-options'] + decay_time = self.config['decay-time'] + # we expect decay_time to be either -1, or large, like 10 or 50. + recurrence_scale = (1.0 if decay_time < 0 else + 1.0 - (abs(delay) / decay_time)) + assert recurrence_scale > 0 # or user may have set decay-time much + # too small. + lstm_str = self.config['lstm-nonlinearity-options'] bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" " recurrence-interval={3}" + " scale={4}" "".format(self.config['clipping-threshold'], self.config['zeroing-threshold'], self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - lstm_str = self.config['lstm-nonlinearity-options'] - + abs(delay), recurrence_scale)) configs = [] @@ -752,6 +650,15 @@ def generate_lstm_config(self): # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] # ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] +# decay-time=-1 [If >0, an approximate maximum on how many frames +# can be remembered via summation into the cell +# contents c_t; enforced by putting a scaling factor +# of recurrence_scale = 1 - abs(delay)/decay_time on +# the recurrence, i.e. the term c_{t-1} in the LSTM +# equations. E.g. setting this to 20 means no more +# than about 20 frames' worth of history, +# i.e. history since about t = t-20, can be +# accumulated in c_t.] class XconfigFastLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "fast-lstmp-layer" @@ -772,10 +679,11 @@ def set_default_configs(self): # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', + 'decay-time': -1.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0 ,# -1.0 stands for no dropout will be added - 'dropout-per-frame' : 'false' # default normal dropout mode + 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added + 'dropout-per-frame' : False # If false, regular dropout, not per frame. 
} def set_derived_configs(self): @@ -789,14 +697,6 @@ def set_derived_configs(self): self.config['non-recurrent-projection-dim'] = \ self.config['recurrent-projection-dim'] - if ((self.config['dropout-proportion'] > 1.0 or - self.config['dropout-proportion'] < 0.0) and - self.config['dropout-proportion'] != -1.0 ): - raise xparser_error("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion'])) - - if (self.config['dropout-per-frame'] != 'false' and - self.config['dropout-per-frame'] != 'true'): - raise xparser_error("dropout-per-frame has invalid value {0}.".format(self.config['dropout-per-frame'])) def check_configs(self): for key in ['cell-dim', 'recurrent-projection-dim', @@ -804,12 +704,18 @@ def check_configs(self): if self.config[key] <= 0: raise RuntimeError("{0} has invalid value {1}.".format( key, self.config[key])) - + if self.config['delay'] == 0: + raise RuntimeError("delay cannot be zero") if (self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] > self.config['cell-dim']): raise RuntimeError("recurrent+non-recurrent projection dim exceeds " "cell dim") + if ((self.config['dropout-proportion'] > 1.0 or + self.config['dropout-proportion'] < 0.0) and + self.config['dropout-proportion'] != -1.0 ): + raise RuntimeError("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion'])) + def auxiliary_outputs(self): @@ -856,23 +762,30 @@ def generate_lstm_config(self): input_dim = self.descriptors['input']['dim'] input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] + delay = self.config['delay'] rec_proj_dim = self.config['recurrent-projection-dim'] nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] + affine_str = self.config['ng-affine-options'] + decay_time = self.config['decay-time'] + # we expect decay_time to be either -1, or large, like 10 or 50. + recurrence_scale = (1.0 if decay_time < 0 else + 1.0 - (abs(delay) / decay_time)) + assert recurrence_scale > 0 # or user may have set decay-time much + # too small. 
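+        # For example, with delay=-3 and decay-time=20, recurrence_scale is
+        # 1 - 3/20 = 0.85; the recurrence on the cell state is scaled by 0.85
+        # at each step, so contributions from older frames decay geometrically.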
+ bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" " recurrence-interval={3}" + " scale={4}" "".format(self.config['clipping-threshold'], self.config['zeroing-threshold'], self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] + abs(delay), recurrence_scale)) + lstm_str = self.config['lstm-nonlinearity-options'] - lstm_dropout_value = self.config['dropout-proportion'] - lstm_dropout_str = 'dropout-proportion='+str(self.config['dropout-proportion']) - lstm_dropout_per_frame_value = self.config['dropout-per-frame'] - lstm_dropout_per_frame_str = 'dropout-per-frame='+str(self.config['dropout-per-frame']) + dropout_proportion = self.config['dropout-proportion'] + dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' configs = [] @@ -891,8 +804,10 @@ def generate_lstm_config(self): configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent " "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str)) - if lstm_dropout_value != -1.0: - configs.append("component name={0}.cr_trunc.dropout type=DropoutComponent dim={1} {2} {3}".format(name, cell_dim + rec_proj_dim, lstm_dropout_str, lstm_dropout_per_frame_str)) + if dropout_proportion != -1.0: + configs.append("component name={0}.cr_trunc.dropout type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}" + .format(name, cell_dim + rec_proj_dim, dropout_proportion, dropout_per_frame)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent projections") configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " @@ -914,21 +829,19 @@ def generate_lstm_config(self): configs.append("# Note: it's not 100% efficient that we have to stitch the c") configs.append("# and r back together to truncate them but it probably"); configs.append("# makes the deriv truncation more accurate .") - if lstm_dropout_value != -1.0: - configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " - "input=Append({0}.c, {0}.r)".format(name)) + configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " + "input=Append({0}.c, {0}.r)".format(name)) + if dropout_proportion != -1.0: configs.append("component-node name={0}.cr_trunc.dropout component={0}.cr_trunc.dropout input={0}.cr_trunc".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc.dropout " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc.dropout " "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) - configs.append("### End LSTM Layer '{0}'".format(name)) else: - configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " - "input=Append({0}.c, {0}.r)".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) - configs.append("### End LSTM Layer '{0}'".format(name)) + configs.append("### End LSTM Layer '{0}'".format(name)) + return configs diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 90e11d0a83e..1791aee665b 100755 --- 
a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -505,7 +505,9 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, run_opts=run_opts, - background_process_handler=background_process_handler) + background_process_handler=background_process_handler, + sum_to_one_penalty=args.combine_sum_to_one_penalty) + if args.cleanup: logger.info("Cleaning up the experiment directory " diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index b5ed26499a4..9874b03051a 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -355,8 +355,9 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): models_to_combine=models_to_combine, egs_dir=egs_dir, left_context=left_context, right_context=right_context, - run_opts=run_opts, - background_process_handler=background_process_handler) + minibatch_size_str=args.minibatch_size, run_opts=run_opts, + background_process_handler=background_process_handler, + sum_to_one_penalty=args.combine_sum_to_one_penalty) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index a26e0aa75cf..48e647d9c5e 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -360,9 +360,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): dir=args.dir, num_iters=num_iters, models_to_combine=models_to_combine, egs_dir=egs_dir, left_context=left_context, right_context=right_context, - run_opts=run_opts, + minibatch_size_str=args.minibatch_size, run_opts=run_opts, background_process_handler=background_process_handler, - get_raw_nnet_from_am=False) + get_raw_nnet_from_am=False, + sum_to_one_penalty=args.combine_sum_to_one_penalty) if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 44f4cca8cb6..97ab378f5fd 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -447,9 +447,11 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): dir=args.dir, num_iters=num_iters, models_to_combine=models_to_combine, egs_dir=egs_dir, left_context=left_context, right_context=right_context, + minibatch_size_str=args.num_chunk_per_minibatch, run_opts=run_opts, chunk_width=args.chunk_width, background_process_handler=background_process_handler, - get_raw_nnet_from_am=False) + get_raw_nnet_from_am=False, + sum_to_one_penalty=args.combine_sum_to_one_penalty) if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index d546377a726..19da38db958 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -442,8 +442,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): models_to_combine=models_to_combine, egs_dir=egs_dir, run_opts=run_opts, left_context=left_context, right_context=right_context, + minibatch_size_str=args.num_chunk_per_minibatch, background_process_handler=background_process_handler, - chunk_width=args.chunk_width) + chunk_width=args.chunk_width, + 
sum_to_one_penalty=args.combine_sum_to_one_penalty) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc index b80c585e7fa..09f01cd947b 100644 --- a/src/nnet3/nnet-chain-combine.cc +++ b/src/nnet3/nnet-chain-combine.cc @@ -405,14 +405,27 @@ double NnetChainCombiner::GetSumToOnePenalty( this_total_weight += this_weight; } tot_weights(c) = this_total_weight; - ans += -0.5 * penalty * - (this_total_weight - 1.0) * (this_total_weight - 1.0); + // this_total_weight_deriv is the derivative of the penalty + // term w.r.t. this component's total weight. + double this_total_weight_deriv; + if (combine_config_.enforce_positive_weights) { + // if combine_config_.enforce_positive_weights is true, then we choose to + // formulate the penalty in a slightly different way.. this solves the + // problem that with the formulation in the 'else' below, if for some + // reason the total weight is << 1.0, the deriv w.r.t. the actual + // parameters gets tiny [because weight = exp(params)]. + double log_total = log(this_total_weight); + ans += -0.5 * penalty * log_total * log_total; + double log_total_deriv = -1.0 * penalty * log_total; + this_total_weight_deriv = log_total_deriv / this_total_weight; + } else { + ans += -0.5 * penalty * + (this_total_weight - 1.0) * (this_total_weight - 1.0); + this_total_weight_deriv = penalty * (1.0 - this_total_weight); + + } if (weights_penalty_deriv != NULL) { KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); - // this_total_weight_deriv is the derivative of the penalty - // term w.r.t. this component's total weight. - double this_total_weight_deriv = - penalty * (1.0 - this_total_weight); for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; (*weights_penalty_deriv)(index) = this_total_weight_deriv; diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc index d50b5adc072..207cfbe8269 100644 --- a/src/nnet3/nnet-combine.cc +++ b/src/nnet3/nnet-combine.cc @@ -403,14 +403,27 @@ double NnetCombiner::GetSumToOnePenalty( this_total_weight += this_weight; } tot_weights(c) = this_total_weight; - ans += -0.5 * penalty * - (this_total_weight - 1.0) * (this_total_weight - 1.0); + // this_total_weight_deriv is the derivative of the penalty + // term w.r.t. this component's total weight. + double this_total_weight_deriv; + if (config_.enforce_positive_weights) { + // if config_.enforce_positive_weights is true, then we choose to + // formulate the penalty in a slightly different way.. this solves the + // problem that with the formulation in the 'else' below, if for some + // reason the total weight is << 1.0, the deriv w.r.t. the actual + // parameters gets tiny [because weight = exp(params)]. + double log_total = log(this_total_weight); + ans += -0.5 * penalty * log_total * log_total; + double log_total_deriv = -1.0 * penalty * log_total; + this_total_weight_deriv = log_total_deriv / this_total_weight; + } else { + ans += -0.5 * penalty * + (this_total_weight - 1.0) * (this_total_weight - 1.0); + this_total_weight_deriv = penalty * (1.0 - this_total_weight); + + } if (weights_penalty_deriv != NULL) { KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); - // this_total_weight_deriv is the derivative of the penalty - // term w.r.t. this component's total weight. 
- double this_total_weight_deriv = - penalty * (1.0 - this_total_weight); for (int32 m = 0; m < num_models; m++) { int32 index = m * num_uc + c; (*weights_penalty_deriv)(index) = this_total_weight_deriv; From f583faebece0de97dc99fa1fa431a2aaec9eca48 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 29 Jan 2017 00:32:22 -0500 Subject: [PATCH 131/213] Merging changes from master into upstream/shortcut --- .../nnet3/train/chain_objf/acoustic_model.py | 23 ++++++++++---- .../nnet3/train/frame_level_objf/common.py | 30 ++++++++++++++----- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index b90349a6fac..fde8ae65461 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -230,7 +230,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, - run_opts, background_process_handler=None): + run_opts, dropout_edit_string="", + background_process_handler=None): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -244,9 +245,10 @@ def train_one_iteration(dir, iter, srand, egs_dir, if os.path.exists('{0}/srand'.format(dir)): try: saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) - except (IOError, ValueError) as e: - raise Exception("Exception while reading the random seed " - "for training: {0}".format(e.str())) + except (IOError, ValueError): + logger.error("Exception while reading the random seed " + "for training") + raise if srand != saved_srand: logger.warning("The random seed provided to this iteration " "(srand={0}) is different from the one saved last " @@ -310,6 +312,17 @@ def train_one_iteration(dir, iter, srand, egs_dir, num_chunk_per_minibatch_str) cur_max_param_change = float(max_param_change) / math.sqrt(2) + raw_model_string = raw_model_string + dropout_edit_string + + shrink_info_str = '' + if shrinkage_value != 1.0: + shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) + + logger.info("On iteration {0}, learning rate is {1}" + "{shrink_info}.".format( + iter, learning_rate, + shrink_info=shrink_info_str)) + train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, num_archives_processed=num_archives_processed, num_archives=num_archives, @@ -530,7 +543,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st models_to_combine.add(num_iters) - for iter in models_to_combine: + for iter in sorted(models_to_combine): model_file = '{0}/{1}.mdl'.format(dir, iter) if os.path.exists(model_file): raw_model_strings.append( diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index f3955772945..25fd94d98ff 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -136,7 +136,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, momentum, max_param_change, shuffle_buffer_size, run_opts, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, - shrinkage_value=1.0, + shrinkage_value=1.0, dropout_edit_string="", get_raw_nnet_from_am=True, background_process_handler=None): """ Called from steps/nnet3/train_*.py scripts for one iteration of neural @@ 
-163,9 +163,10 @@ def train_one_iteration(dir, iter, srand, egs_dir, if os.path.exists('{0}/srand'.format(dir)): try: saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) - except (IOError, ValueError) as e: - raise Exception("Exception while reading the random seed " - "for training: {0}".format(e.str())) + except (IOError, ValueError): + logger.error("Exception while reading the random seed " + "for training") + raise if srand != saved_srand: logger.warning("The random seed provided to this iteration " "(srand={0}) is different from the one saved last " @@ -238,6 +239,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, "{dir}/{iter}.raw - |".format( lr=learning_rate, dir=dir, iter=iter)) + raw_model_string = raw_model_string + dropout_edit_string + if do_average: cur_minibatch_size_str = minibatch_size_str cur_max_param_change = max_param_change @@ -255,6 +258,15 @@ def train_one_iteration(dir, iter, srand, egs_dir, except OSError: pass + shrink_info_str = '' + if shrinkage_value != 1.0: + shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) + + logger.info("On iteration {0}, learning rate is {1}" + "{shrink_info}.".format( + iter, learning_rate, + shrink_info=shrink_info_str)) + train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, num_archives_processed=num_archives_processed, num_archives=num_archives, @@ -296,7 +308,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, for i in range(1, num_jobs + 1): os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) except OSError: - raise Exception("Error while trying to delete the raw models") + logger.error("Error while trying to delete the raw models") + raise if get_raw_nnet_from_am: new_model = "{0}/{1}.mdl".format(dir, iter + 1) @@ -346,8 +359,9 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, try: os.remove(file) except OSError: - raise Exception("There was error while trying to remove " - "lda stat files.") + logger.error("There was error while trying to remove " + "lda stat files.") + raise # this computes a fixed affine transform computed in the way we described # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled # variant of an LDA transform but without dimensionality reduction. @@ -453,7 +467,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, models_to_combine.add(num_iters) - for iter in models_to_combine: + for iter in sorted(models_to_combine): if get_raw_nnet_from_am: model_file = '{0}/{1}.mdl'.format(dir, iter) if not os.path.exists(model_file): From b78e5ccc9a8be07d0dfb8c8c9b9fd58160efcf7e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 30 Jan 2017 17:04:31 -0500 Subject: [PATCH 132/213] [scripts] change default in nnet3/chain/build_tree.sh --- egs/wsj/s5/steps/nnet3/chain/build_tree.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index bbff6263fe4..72bc91c6014 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -22,7 +22,9 @@ cmd=run.pl context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. 
cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves frame_subsampling_factor=1 -leftmost_questions_truncate=10 +leftmost_questions_truncate=-1 # note: this used to default to 10, but we never + # use this option now with value != -1, and + # we're changing the default tree_stats_opts= cluster_phones_opts= # End configuration section. @@ -179,4 +181,3 @@ fi cp $dir/1.mdl $dir/final.mdl echo $0: Done building tree - From d5bf81b60a91cdb0a9628d5d93c48f155ddf2c46 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 30 Jan 2017 17:06:13 -0500 Subject: [PATCH 133/213] [scripts] various minor script fixes or extensions --- .../s5_r2/local/chain/compare_wer_general.sh | 1 - .../steps/libs/nnet3/xconfig/basic_layers.py | 18 +++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh index d3acae200b8..00b2d29cc88 100755 --- a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh +++ b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh @@ -15,7 +15,6 @@ if [ "$1" == "--looped" ]; then shift fi - used_epochs=false # this function set_names is used to separate the epoch-related parts of the name diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 3726eebeb6e..1a42c86ad81 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -441,6 +441,12 @@ class XconfigOutputLayer(XconfigLayerBase): -0.25 is referred to as presoftmax_prior_scale_power in scripts. In the scripts this would normally be set to config_dir/presoftmax_prior_scale.vec + output-delay=0 : Can be used to shift the frames on the output, equivalent + to delaying labels by this many frames (positive value increases latency + in online decoding but may help if you're using unidirectional LSTMs. + ng-affine-options='' : Can be used supply non-default options to the affine + layer (intended for the natural gradient but can be an arbitrary string + to be added to the config line. e.g. 'update-period=2'.). 
""" def __init__(self, first_token, key_to_value, prev_names = None): @@ -466,7 +472,8 @@ def set_default_configs(self): 'max-change' : 1.5, 'param-stddev' : 0.0, 'bias-stddev' : 0.0, - 'output-delay' : 0 + 'output-delay' : 0, + 'ng-affine-options' : '' } def check_configs(self): @@ -529,6 +536,7 @@ def get_full_config(self): bias_stddev = self.config['bias-stddev'] output_delay = self.config['output-delay'] max_change = self.config['max-change'] + ng_affine_options = self.config['ng-affine-options'] # note: ref.config is used only for getting the left-context and # right-context of the network; @@ -541,9 +549,9 @@ def get_full_config(self): ' output-dim={2}' ' param-stddev={3}' ' bias-stddev={4}' - ' max-change={5} ' + ' max-change={5} {6} ' ''.format(self.name, input_dim, output_dim, - param_stddev, bias_stddev, max_change) + + param_stddev, bias_stddev, max_change, ng_affine_options) + ('learning-rate-factor={0} '.format(learning_rate_factor) if learning_rate_factor != 1.0 else '')) ans.append((config_name, line)) @@ -690,7 +698,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): self_repair_scale = self.config['self-repair-scale'] target_rms = self.config['target-rms'] max_change = self.config['max-change'] - ng_opt_str = self.config['ng-affine-options'] + ng_affine_options = self.config['ng-affine-options'] configs = [] # First the affine node. @@ -701,7 +709,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ' max-change={3}' ' {4}' ''.format(self.name, input_dim, output_dim, - max_change, ng_opt_str)) + max_change, ng_affine_options)) configs.append(line) line = ('component-node name={0}.affine' From 1c8cba626e2f9ce5780b502cebf52b443230b9df Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 30 Jan 2017 20:14:44 -0500 Subject: [PATCH 134/213] [egs] Adding various tuning scripts on tedlium. --- .../local/chain/tuning/run_tdnn_lstm_1e.sh | 8 +- .../local/chain/tuning/run_tdnn_lstm_1n.sh | 340 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1o.sh | 344 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1r.sh | 339 +++++++++++++++++ .../nnet3/tuning/run_tdnn_lstm_1a_disc.sh | 246 +++++++++++++ src/cudamatrix/cu-sparse-matrix.h | 4 - 6 files changed, 1274 insertions(+), 7 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a_disc.sh diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh index 7f0b9588b66..32950e7df6a 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -60,13 +60,14 @@ chunk_left_context=40 chunk_right_context=0 chunk_left_context_initial=0 chunk_right_context_final=0 +frames_per_chunk=140,100,160 # decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) extra_left_context=50 extra_right_context=0 extra_left_context_initial=0 extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 + # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. @@ -74,6 +75,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. 
tdnn_lstm_affix=1e #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true # End configuration section. echo "$0 $@" # Print the command line for logging @@ -241,7 +243,7 @@ if [ $stage -le 18 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ + --cleanup.remove-egs "$remove_egs" \ --feat-dir $train_data_dir \ --tree-dir $tree_dir \ --lat-dir $lat_dir \ diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh new file mode 100755 index 00000000000..ed79404f815 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# 1n is as 1k, but maxing out at 0.5, not 0.7. +# 1k is as 1e, but introducing a dropout schedule. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.0 +# [looped:] 9.0 8.6 8.9 8.9 +# WER on dev(rescored) 8.4 7.9 8.2 8.2 +# [looped:] 8.4 7.8 8.2 8.3 +# WER on test(orig) 8.8 8.8 8.9 8.9 +# [looped:] 8.8 8.7 8.8 8.8 +# WER on test(rescored) 8.4 8.3 8.2 8.5 +# [looped:] 8.3 8.3 8.3 8.4 +# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 +# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 +# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 +# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 + + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. 
+ +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1n #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
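  # A rough reading of the dropout schedule passed to train.py further down
  # (assuming the usual piecewise-linear interpretation of the schedule
  # string): a value such as
  #   --trainer.dropout-schedule='0,0@0.20,0.5@0.5,0@0.75,0'
  # keeps the dropout proportion at 0 until 20% of training is done, ramps it
  # linearly up to its maximum of 0.5 at the halfway point, ramps it back down
  # to 0 by 75%, and leaves it at 0 for the rest of training.  The 1k, 1n and
  # 1o variants differ mainly in the maximum value used in this string (0.7,
  # 0.5 and 0.3 respectively), and 1o also changes where the mask is applied.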
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.5@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
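  # (One way to see the difference from the previous stage: looped decoding
  # carries the recurrent state forward from one chunk to the next instead of
  # rebuilding it from extra left context, which is why, unlike the regular
  # decoding above, no --extra-left-context or --extra-right-context options
  # are passed below.)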
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh new file mode 100755 index 00000000000..ec97bce3a8b --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh @@ -0,0 +1,344 @@ +#!/bin/bash + +# 1o is as 1k, but putting the dropout on (c,m), i.e. the output +# of the LstmNonlinearityComponent, which I believe is the same as +# putting it on (i,f) which Gaofeng found worked well in the non-fast Lstm +# component; and using schedule which maxes out at 0.3, not 0.7. +# [note: this was a little worse. turns out it was not the same as +# what gaofeng did because he had separate masks on (i,f). +# note: I've since removed the script-level support for this. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m,n,o}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi tdnn_lstm1n_sp_bi tdnn_lstm1o_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.0 8.8 8.8 +# [looped:] 9.0 8.6 8.9 8.9 8.8 8.9 +# WER on dev(rescored) 8.4 7.9 8.2 8.2 8.1 8.1 +# [looped:] 8.4 7.8 8.2 8.3 8.1 8.2 +# WER on test(orig) 8.8 8.8 8.9 8.9 8.7 8.7 +# [looped:] 8.8 8.7 8.8 8.8 8.7 8.7 +# WER on test(rescored) 8.4 8.3 8.2 8.5 8.3 8.2 +# [looped:] 8.3 8.3 8.3 8.5 8.3 8.2 +# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 -0.0702 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 -0.0836 -0.0858 +# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 -0.8719 -0.8998 +# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 -0.9732 -1.0084 + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1o #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. + lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-place=2 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh new file mode 100755 index 00000000000..b3da38e412a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# 1r is as 1e, but changing update-period of natural gradient from 4 to 1, +# Not helpful. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,r}_sp_bi +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1r_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1r_sp_bi +# WER on dev(orig) 9.0 9.0 +# [looped:] 9.0 9.1 +# WER on dev(rescored) 8.4 8.5 +# [looped:] 8.4 8.6 +# WER on test(orig) 8.8 9.1 +# [looped:] 8.8 9.0 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.3 8.5 +# Final train prob -0.0648 -0.0642 +# Final valid prob -0.0827 -0.0838 +# Final train prob (xent) -0.8372 -0.8319 +# Final valid prob (xent) -0.9497 -0.9635 + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. 
local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1r #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. +remove_egs=true + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + tdnn_opts='ng-affine-options="update-period=1"' + lstmp_opts='ng-affine-options="update-period=1" decay-time=20' + output_opts='max-change=1.5 ng-affine-options="update-period=1"' + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 $tdnn_opts + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) $tdnn_opts + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) $tdnn_opts + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) $tdnn_opts + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) $tdnn_opts + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) $tdnn_opts + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a_disc.sh new file mode 100755 index 00000000000..1826caf3d05 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a_disc.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# This script does discriminative training on top of CE nnet3 system. To +# simplify things, this assumes you are using the "cleaned" data (since this is +# generally better), i.e. it won't work if you used options to run_tdnn_lstm_1a.sh +# to use the non-cleaned data. +# +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the alignment and the lattice generation/egs-dumping takes quite a bit +# of CPU time. + +# below is with the current settings (effective_learning_rate=0.0000025, last_layer_factor=0.5): +# steps/info/nnet3_disc_dir_info.pl exp/nnet3_cleaned/tdnn_lstm1a_sp_smbrslow +# exp/nnet3_cleaned/tdnn_lstm1a_sp_smbrslow:num-jobs=4;effective-lrate=2.5e-06;last-layer-factor=0.50;iters-per-epoch=55;epoch[0,1,2,3]:train-objf=[0.94,0.96,0.97,0.97],valid-objf=[0.91,0.93,0.93,0.93],train-counts=[0.40,0.25,0.17,0.12],valid-counts=[0.57,0.31,0.34,0.35] + +# local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1a_sp_smbrslow:{1,2,3} +# System tdnn_lstm1a_sp tdnn_lstm1a_sp_smbrslow:1 tdnn_lstm1a_sp_smbrslow:2 tdnn_lstm1a_sp_smbrslow:3 +# WER on dev(orig) 11.0 9.4 9.4 9.4 +# [looped:] 11.0 9.4 9.5 9.4 +# WER on dev(rescored) 10.3 8.8 8.7 8.7 +# [looped:] 10.3 8.8 8.9 8.9 +# WER on test(orig) 10.8 9.6 9.7 9.6 +# [looped:] 10.7 9.6 9.6 9.7 +# WER on test(rescored) 10.1 9.1 9.2 9.1 +# [looped:] 10.0 9.1 9.2 9.1 + +# Below is with twice the lrate (5e-06) and the same last-layer-factor (0.5). Trained too fast. +# exp/nnet3_cleaned/tdnn_lstm1a_sp_smbr:num-jobs=4;effective-lrate=5e-06;last-layer-factor=0.50;iters-per-epoch=55;epoch[0,1,2,3]:train-objf=[0.94,0.97,0.97,0.98],valid-objf=[0.91,0.93,0.93,0.93],train-counts=[0.40,0.22,0.12,0.09],valid-counts=[0.57,0.31,0.27,0.32] +# I'm not showing the looped decoding results with this older step; +# there was a script bug (now fixed) and I don't want to rerun them. +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1a_sp_smbr:{1,2,3} +# System tdnn_lstm1a_sp tdnn_lstm1a_sp_smbr:1 tdnn_lstm1a_sp_smbr:2 tdnn_lstm1a_sp_smbr:3 +# WER on dev(orig) 11.0 9.4 9.4 9.5 +# WER on dev(rescored) 10.3 8.8 8.8 8.9 +# WER on test(orig) 10.8 9.6 9.8 9.8 +# WER on test(rescored) 10.1 9.1 9.3 9.4 + +set -e +set -uo pipefail + +stage=1 +train_stage=-10 # can be used to start training in the middle. 
+get_egs_stage=0 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like + # alignments and degs). +degs_dir= # set this to use preexisting degs. +nj=400 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +graph_dir=exp/tri3_cleaned/graph +srcdir=exp/nnet3_cleaned/tdnn_lstm1a_sp +train_data_dir=data/train_cleaned_sp_hires_comb +online_ivector_dir=exp/nnet3_cleaned/ivectors_train_cleaned_sp_hires_comb + +## Objective options +criterion=smbr +one_silence_class=true + +# originally ran with effective_learning_rate=0.000005, +# changing to effective_learning_rate=0.0000025 and using affix=slow + +# you can set --disc-affix if you run different configurations. +disc_affix= + +dir=${srcdir}_${criterion}${disc_affix} + +## Egs options. Give quite a few choices of chunk length, +## so it can split utterances without much gap or overlap. +frames_per_eg=300,280,150,120,100 +frames_overlap_per_eg=0 +frames_per_chunk_egs=200 # for alignments and denlat creation. +frames_per_chunk_decoding=50 # for decoding; should be the same as the value + # used in the script that trained the nnet. + # We didn't set the frames_per_chunk in + # run_tdnn_lstm_1a.sh, so it defaults to 50. +## these context options should match the training condition. (chunk_left_context, +## chunk_right_context) +## We set --extra-left-context-initial 0 and --extra-right-context-final 0 +## directly in the script below, but this should also match the training condition. +## note: --extra-left-context should be the same as the chunk_left_context (or in +## general, the argument of --egs.chunk-left-context) in the baseline script. +extra_left_context=40 +extra_right_context=0 + + + +## Nnet training options +effective_learning_rate=0.0000025 +last_layer_factor=0.5 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=3 +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options, + # in chain models. +minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); + # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). + + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! cuda-compiled; then + cat </dev/null || true + + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x + # We don't test the iter "epoch${x}_adj", although it's computed, + # because prior-adjustment doesn't make sense for chain models + # and it degrades the results. 
+ ( + steps/nnet3/decode_looped.sh \ + --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3_cleaned/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/${decode_set}_hires $dir/decode_looped_${decode_set}_${iter} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${decode_set}_hires \ + ${dir}/decode_looped_${decode_set}_${iter} ${dir}/decode_looped_${decode_set}_${iter}_rescore || exit 1 + ) || touch $dir/.error & + done + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. + # actually, keep the alignments in case we need them later.. they're slow to + # create, and quite big. + # rm ${srcdir}_ali/ali.*.gz || true + + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + +exit 0; diff --git a/src/cudamatrix/cu-sparse-matrix.h b/src/cudamatrix/cu-sparse-matrix.h index 1298ee5ea5f..4da74871bac 100644 --- a/src/cudamatrix/cu-sparse-matrix.h +++ b/src/cudamatrix/cu-sparse-matrix.h @@ -121,10 +121,6 @@ class CuSparseMatrix { ~CuSparseMatrix() { } - // Use the CuMatrix::CopyFromSmat() function to copy from this to - // CuMatrix. - // Also see CuMatrix::AddSmat(). - protected: // The following two functions should only be called if we did not compile // with CUDA or could not get a CUDA card; in that case the contents are From 3a8c0a196a4e348326327ab8061d8f88e23bfd9f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 30 Jan 2017 20:38:05 -0500 Subject: [PATCH 135/213] [egs] egs/swbd/s5c, minor script updates and new tuning scripts. --- .../s5c/local/chain/compare_wer_general.sh | 131 +++++++-- .../local/chain/tuning/run_tdnn_lstm_1c.sh | 0 .../local/chain/tuning/run_tdnn_lstm_1d.sh | 266 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1e.sh | 258 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1f.sh | 262 +++++++++++++++++ .../s5c/local/nnet3/compare_wer_general.sh | 4 +- 6 files changed, 898 insertions(+), 23 deletions(-) mode change 100644 => 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh index 1b1f0d16047..29a5dc83063 100755 --- a/egs/swbd/s5c/local/chain/compare_wer_general.sh +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -1,66 +1,155 @@ -#!/bin/bash # this script is used for comparing decoding results between systems. -# e.g. local/chain/compare_wer_general.sh tdnn_7h_sp tdnn_7i_sp +# e.g. local/chain/compare_wer_general.sh tdnn_c_sp tdnn_d_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer_general.sh tdnn_d_sp tdnn_d_sp_smbr:1 tdnn_d_sp_smbr:2 ... -echo "$0 $*"; # print command line. +echo "# $0 $*"; # print command line. 
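# As an example of how the arguments are interpreted (using the names from
# the usage comment above): 'tdnn_d_sp_smbr:1' is split by set_names, defined
# below, into dirname=exp/chain/tdnn_d_sp_smbr and epoch_suffix=_epoch1,
# while a plain 'tdnn_d_sp' gives dirname=exp/chain/tdnn_d_sp and an empty
# epoch_suffix; the WERs are then looked up in decode directories whose names
# end in that suffix.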
-echo -n "System " -for x in $*; do printf "% 10s" $x; done +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi + +echo -n "# System " +for x in $*; do printf " % 9s" $x; done echo -echo -n "WER on train_dev(tg) " + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free name, like: +# set_names tdnn_a +# it will set dir=exp/chain/tdnn_a and epoch_suffix="" +# If called with something like: +# set_names tdnn_d_smbr:3 +# it will set dir=exp/chain/tdnn_d_smbr and epoch_suffix="epoch3" +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + name=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + dirname=exp/chain/$name + if [ -z $epoch ]; then + epoch_suffix="" + else + used_epochs=true + epoch_suffix=_epoch${epoch} + fi +} + + +echo -n "# WER on train_dev(tg) " for x in $*; do - wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + set_names $x + # note: the '*' in the directory name is because there + # is _hires_ in there for the cross-entropy systems, and + # nothing for the sequence trained systems. + wer=$(grep WER $dirname/decode_train_dev*sw1_tg$epoch_suffix/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo +if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x + wer=$(grep WER $dirname/decode_train_dev*sw1_tg${epoch_suffix}_looped/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo +fi -echo -n "WER on train_dev(fg) " +echo -n "# WER on train_dev(fg) " for x in $*; do - wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + set_names $x + wer=$(grep WER $dirname/decode_train_dev*sw1_fsh_fg$epoch_suffix/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo -echo -n "WER on eval2000(tg) " +if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x + wer=$(grep WER $dirname/decode_train_dev*sw1_fsh_fg${epoch_suffix}_looped/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo +fi + +echo -n "# WER on eval2000(tg) " for x in $*; do - wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + set_names $x + wer=$(grep Sum $dirname/decode_eval2000*sw1_tg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo -echo -n "WER on eval2000(fg) " +if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_eval2000*sw1_tg${epoch_suffix}_looped/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo +fi + +echo -n "# WER on eval2000(fg) " for x in $*; do - wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + set_names $x + wer=$(grep Sum $dirname/decode_eval2000*sw1_fsh_fg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo +if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_eval2000*sw1_fsh_fg${epoch_suffix}_looped/score*/*ys | grep -v swbd | utils/best_wer.sh | awk 
'{print $2}') + printf "% 10s" $wer + done + echo +fi + + +if $used_epochs; then + # we don't print the probs in this case. + exit 0 +fi + + echo -n "Final train prob " for x in $*; do - prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob + prob=$(grep Overall exp/chain/${x}/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10.3f" $prob done echo echo -n "Final valid prob " for x in $*; do - prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob + prob=$(grep Overall exp/chain/${x}/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10.3f" $prob done echo echo -n "Final train prob (xent) " for x in $*; do - prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob + prob=$(grep Overall exp/chain/${x}/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10.3f" $prob done echo echo -n "Final valid prob (xent) " for x in $*; do - prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob + prob=$(grep Overall exp/chain/${x}/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10.4f" $prob done echo diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh old mode 100644 new mode 100755 diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh new file mode 100755 index 00000000000..837eb944875 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -0,0 +1,266 @@ +#!/bin/bash + + +# run_tdnn_lstm_1d.sh is like run_tdnn_lstm_1c.sh but making +# various kaldi-5.1-related upgrades to the script: +# change chunk-width to be variable, add extra_left_context_initial=0 +# and extra_right_context_final=0; add looped decoding. +# Also changed frames-per-iter from 1.2 million to 1.5 million... this +# might have been a mistake, trying 1 million in 1f to see if this matters. + +# run_tdnn_lstm_1c.sh is like run_tdnn_lstm_1b.sh but using the +# new 'fast-lstm' layer. Results are slightly improved, plus +# it's faster. See PR #1243 on github, and issue #1237. +# This used to be called run_tdnn_fastlstm_1b.sh. + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1d # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh new file mode 100755 index 00000000000..bf93b156974 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -0,0 +1,258 @@ +#!/bin/bash + +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1e # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir=exp/chain/tdnn_lstm_1d_sp/egs + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
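+  # (Rough illustration of the numbers below: with --frame-subsampling-factor 3
+  # the 'chain' model produces one output every 3 input frames, i.e. about
+  # 100/3 = 33 outputs per second for 10ms frames, and 7000 is the target
+  # number of tree leaves.)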
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh new file mode 100755 index 00000000000..3d9e1e4a63b --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -0,0 +1,262 @@ +#!/bin/bash + +# run_tdnn_lstm_1f.sh is like run_tdnn_lstm_1e.sh but +# reducing the frames-per-iter from 1.5 million to 1 million, +# since the time per iter was too much (about 5 minutes). + +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1f # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b1{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1000000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/compare_wer_general.sh b/egs/swbd/s5c/local/nnet3/compare_wer_general.sh index 37eaeeac85b..7cf42c9ae04 100755 --- a/egs/swbd/s5c/local/nnet3/compare_wer_general.sh +++ b/egs/swbd/s5c/local/nnet3/compare_wer_general.sh @@ -86,7 +86,7 @@ echo -n "# Final train prob " for x in $*; do set_names $x prob=$(grep log-likelihood $dirname/log/compute_prob_train.combined.log | awk '{print $8}') - printf "% 10s" $prob + printf "% 10.3f" $prob done echo @@ -94,6 +94,6 @@ echo -n "# Final valid prob " for x in $*; do set_names $x prob=$(grep log-likelihood $dirname/log/compute_prob_valid.combined.log | awk '{print $8}') - printf "% 10s" $prob + printf "% 10.3f" $prob done echo From 7b6b690c7e066f21811a5e9d21e0ee0b17f35db6 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 31 Jan 2017 00:33:22 -0500 Subject: [PATCH 136/213] [scripts] add utils/data/shift_feats.sh, deprecates steps/shift_feats.sh (#1386) --- egs/wsj/s5/steps/shift_feats.sh | 5 ++ .../s5/utils/data/shift_and_combine_feats.sh | 55 ++++++++++++ egs/wsj/s5/utils/data/shift_feats.sh | 55 ++++++++++++ src/featbin/shift-feats.cc | 89 +++++++++++++------ 4 files changed, 176 insertions(+), 28 deletions(-) create mode 100755 egs/wsj/s5/utils/data/shift_and_combine_feats.sh create mode 100755 egs/wsj/s5/utils/data/shift_feats.sh diff --git a/egs/wsj/s5/steps/shift_feats.sh b/egs/wsj/s5/steps/shift_feats.sh index 22b17f2cb09..ada5716f187 100755 --- a/egs/wsj/s5/steps/shift_feats.sh +++ b/egs/wsj/s5/steps/shift_feats.sh @@ -3,6 +3,9 @@ # Copyright 2016 Vimal Manohar # Apache 2.0 +# This script is deprecated. The newer script utils/data/shift_feats.sh +# should be used instead. + # This script shifts the feats in the input data directory and creates a # new directory _fs with shifted feats. # If the shift is negative, the initial frames get truncated and the @@ -25,6 +28,8 @@ if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -ne 4 ]; then + echo "This script is deprecated. The newer script utils/data/shift_feats.sh" + echo "should be used instead." echo "usage: $0 [options] "; echo "e.g.: $0 -1 data/train exp/shift-1_train mfcc" echo "options: " diff --git a/egs/wsj/s5/utils/data/shift_and_combine_feats.sh b/egs/wsj/s5/utils/data/shift_and_combine_feats.sh new file mode 100755 index 00000000000..1a15b324ee8 --- /dev/null +++ b/egs/wsj/s5/utils/data/shift_and_combine_feats.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 3 data/train data/train_fs3" + echo "For use in perturbing data for discriminative training and alignment of" + echo "frame-subsampled systems, this script uses utils/data/shift_feats.sh" + echo "and utils/data/combine_data.sh to shift the features" + echo " different ways and combine them." 
+ echo "E.g. if is 3, this script will combine" + echo "the data frame-shifted by -1, 0 and 1 (c.f. shift-feats)." + exit 1 +fi + +frame_subsampling_factor=$1 +srcdir=$2 +destdir=$3 + +if [ ! -f $srcdir/feats.scp ]; then + echo "$0: expected $srcdir/feats.scp to exist" + exit 1 +fi + +if [ -f $destdir/feats.scp ]; then + echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)" + exit 1 +fi + +tmp_shift_destdirs=() +for frame_shift in `seq $[-(frame_subsampling_factor/2)] $[-(frame_subsampling_factor/2) + frame_subsampling_factor - 1]`; do + if [ "$frame_shift" == 0 ]; then continue; fi + utils/data/shift_feats.sh $frame_shift $srcdir ${destdir}_fs$frame_shift || exit 1 + tmp_shift_destdirs+=("${destdir}_fs$frame_shift") +done +utils/data/combine_data.sh $destdir $srcdir ${tmp_shift_destdirs[@]} || exit 1 +rm -r ${tmp_shift_destdirs[@]} + +utils/validate_data_dir.sh $destdir + +src_nf=`cat $srcdir/feats.scp | wc -l` +dest_nf=`cat $destdir/feats.scp | wc -l` +if [ $[src_nf*frame_subsampling_factor] -ne $dest_nf ]; then + echo "There was a problem. Expected number of feature lines in destination dir to be $[src_nf*frame_subsampling_factor];" + exit 1; +fi + +echo "$0: Successfully generated $frame_subsampling_factor-way shifted version of data in $srcdir, in $destdir" diff --git a/egs/wsj/s5/utils/data/shift_feats.sh b/egs/wsj/s5/utils/data/shift_feats.sh new file mode 100755 index 00000000000..2ae7b2435d3 --- /dev/null +++ b/egs/wsj/s5/utils/data/shift_feats.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2017 Hossein Hadian +# Apache 2.0 + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo " Usage: $0 " + echo "e.g.: $0 -1 data/train data/train_fs-1" + echo "The script creates a new data directory with the features modified" + echo "using the program shift-feats with the specified frame-shift." + echo "This program automatically adds the prefix 'fs-' to the" + echo "utterance and speaker names. See also utils/data/shift_and_combine_feats.sh" + exit 1 +fi + +frame_shift=$1 +srcdir=$2 +destdir=$3 + + +if [ "$destdir" == "$srcdir" ]; then + echo "$0: this script requires and to be different." + exit 1 +fi + +if [ ! -f $srcdir/feats.scp ]; then + echo "$0: no such file $srcdir/feats.scp" + exit 1; +fi + +utt_prefix="fs$frame_shift-" +spk_prefix="fs$frame_shift-" + +mkdir -p $destdir +utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \ + $srcdir $destdir + +if grep --quiet "'" $srcdir/feats.scp; then + echo "$0: the input features already use single quotes. Can't proceed." 
+ exit 1; +fi + +awk -v shift=$frame_shift 'NF == 2 {uttid=$1; feat=$2; qt="";} \ +NF > 2 {idx=index($0, " "); uttid=$1; feat=substr($0, idx + 1); qt="\x27";} \ +NF {print uttid " shift-feats --print-args=false --shift=" shift, qt feat qt " - |";}' \ + $destdir/feats.scp >$destdir/feats_shifted.scp +mv -f $destdir/feats_shifted.scp $destdir/feats.scp + +echo "$0: Done" + diff --git a/src/featbin/shift-feats.cc b/src/featbin/shift-feats.cc index 7b970e92248..5d392c9d15a 100644 --- a/src/featbin/shift-feats.cc +++ b/src/featbin/shift-feats.cc @@ -22,20 +22,41 @@ #include "util/common-utils.h" #include "matrix/kaldi-matrix.h" +namespace kaldi { + void ShiftFeatureMatrix(const Matrix &src, int32 shift, + Matrix* rearranged) { + for (int32 r = 0; r < src.NumRows(); r++) { + int32 src_r = r - shift; + if (src_r < 0) src_r = 0; + if (src_r >= src.NumRows()) src_r = src.NumRows() - 1; + rearranged->Row(r).CopyFromVec(src.Row(src_r)); + } + } +} int main(int argc, char *argv[]) { try { using namespace kaldi; const char *usage = - "Copy features and possibly shift them in time while maintaining the length, e.g.\n" - "shift-feats --shift=1 will shift all frames to the\n" - "right by one (the first frame would be duplicated).\n" - "See also: copy-feats, copy-matrix\n"; + "Copy features, and possibly shift them while maintaining the " + "num-frames.\n" + "Usage: shift-feats [options] " + "\n" + "or: shift-feats [options] \n" + "e.g.: shift-feats --shift=-1 foo.scp bar.ark\n" + "or: shift-feats --shift=1 foo.mat bar.mat\n" + "See also: copy-feats, copy-matrix, select-feats, extract-rows,\n" + "subset-feats, subsample-feats, splice-feats, paste-feats, " + "concat-feats\n"; ParseOptions po(usage); + bool binary = true; int32 shift = 0; - po.Register("shift", &shift, "Number of frames by which to shift the features."); + po.Register("shift", &shift, "Number of frames by which to shift the " + "features."); + po.Register("binary", &binary, "Binary-mode output (not relevant if " + "writing to archive)"); po.Read(argc, argv); @@ -46,32 +67,40 @@ int main(int argc, char *argv[]) { int32 num_done = 0, num_err = 0; - SequentialBaseFloatMatrixReader feat_reader(po.GetArg(1)); - BaseFloatMatrixWriter feat_writer(po.GetArg(2)); - - - for (; !feat_reader.Done(); feat_reader.Next()) { - const std::string &key = feat_reader.Key(); - const Matrix &src = feat_reader.Value(); - if (src.NumRows() == 0) { - KALDI_WARN << "Empty matrix for key " << key; - num_err++; - continue; + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { + SequentialBaseFloatMatrixReader feat_reader(po.GetArg(1)); + BaseFloatMatrixWriter feat_writer(po.GetArg(2)); + + + for (; !feat_reader.Done(); feat_reader.Next()) { + const std::string &key = feat_reader.Key(); + const Matrix &src = feat_reader.Value(); + if (src.NumRows() == 0) { + KALDI_WARN << "Empty matrix for key " << key; + num_err++; + continue; + } + Matrix rearranged(src.NumRows(), src.NumCols()); + ShiftFeatureMatrix(src, shift, &rearranged); + feat_writer.Write(key, rearranged); + num_done++; } + + KALDI_LOG << "Shifted " << num_done << " features by " + << shift << " frames; " << num_err << " with errors."; + return (num_done > 0 ? 
0 : 1); + } else { + std::string feat_rxfilename = po.GetArg(1), + feat_wxfilename = po.GetArg(2); + Matrix src; + ReadKaldiObject(feat_rxfilename, &src); + if (src.NumRows() == 0) + KALDI_ERR << "Empty input matrix"; Matrix rearranged(src.NumRows(), src.NumCols()); - for (int32 r = 0; r < src.NumRows(); r++) { - int32 src_r = r - shift; - if (src_r < 0) src_r = 0; - if (src_r >= src.NumRows()) src_r = src.NumRows() - 1; - rearranged.Row(r).CopyFromVec(src.Row(src_r)); - } - feat_writer.Write(key, rearranged); - num_done++; + ShiftFeatureMatrix(src, shift, &rearranged); + WriteKaldiObject(rearranged, feat_wxfilename, binary); + // we do not print any log messages here } - - KALDI_LOG << "Shifted " << num_done << " features by " - << shift << " frames; " << num_err << " with errors."; - return (num_done > 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); return -1; @@ -87,4 +116,8 @@ int main(int argc, char *argv[]) { 1 1 1 1 2 2 ] + + + echo "[ 1 1; 2 2; 3 3 ]" | ./shift-feats --print-args=false --binary=false \ + --shift=1 - - */ From 3da81693234da72e4c9a004c2429159fe2cfb005 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 30 Jan 2017 23:11:16 -0500 Subject: [PATCH 137/213] [egs] Small fixes/additions in Swbd/s5c chain scripts --- .../local/chain/tuning/run_tdnn_lstm_1e.sh | 4 +- .../local/chain/tuning/run_tdnn_lstm_1g.sh | 261 ++++++++++++++++++ 2 files changed, 263 insertions(+), 2 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh index bf93b156974..14dbb1cdd2e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -242,11 +242,11 @@ if [ $stage -le 16 ]; then --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ - $dir/decode_${decode_set}_sw1_tg || exit 1; + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; if $has_fisher; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ - $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; fi ) & done diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh new file mode 100755 index 00000000000..6cacdf2dadb --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +# 1g is like 1e, but reducing decay-time from 20 to 15, to see if +# it reduces the difference between regular and looped decoding. +# +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1g # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. 
+frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir=exp/chain/tdnn_lstm_1d_sp/egs + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=15" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
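+  # For example, with xent_regularize=0.01 as configured above, the factor
+  # used here is learning_rate_factor = 0.5 / 0.01 = 50.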
+ output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; From 7f38a5ea96949f3e32cdfa3d453d9f352d1ffbfd Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 1 Feb 2017 18:59:08 -0500 Subject: [PATCH 138/213] [src,egs,scripts]: various minor fixes: make num-epochs continuous; add decay-time to other LSTM types; bug-fix in nnet3 combination code; swbd/s5c results added. --- .../s5c/local/chain/compare_wer_general.sh | 8 +-- .../local/chain/tuning/run_tdnn_lstm_1f.sh | 17 +++++++ egs/wsj/s5/steps/libs/nnet3/train/common.py | 16 ++++-- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 51 +++++++++++++++---- egs/wsj/s5/steps/nnet3/chain/train.py | 6 +-- egs/wsj/s5/steps/nnet3/train_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 2 +- src/nnet3/nnet-chain-combine.cc | 2 +- src/nnet3/nnet-combine.cc | 3 +- 11 files changed, 84 insertions(+), 27 deletions(-) diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh index 29a5dc83063..f56cbfb8675 100755 --- a/egs/swbd/s5c/local/chain/compare_wer_general.sh +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -126,28 +126,28 @@ if $used_epochs; then fi -echo -n "Final train prob " +echo -n "# Final train prob " for x in $*; do prob=$(grep Overall exp/chain/${x}/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') printf "% 10.3f" $prob done echo -echo -n "Final valid prob " +echo -n "# Final valid prob " for x in $*; do prob=$(grep Overall exp/chain/${x}/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') printf "% 10.3f" $prob done echo -echo -n "Final train prob (xent) " +echo -n "# Final train prob (xent) " for x in $*; do prob=$(grep Overall exp/chain/${x}/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') printf "% 10.3f" $prob done echo -echo -n "Final valid prob (xent) " +echo -n "# Final valid prob (xent) " for x in $*; do prob=$(grep Overall exp/chain/${x}/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') printf "% 10.4f" $prob diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh index 3d9e1e4a63b..b8f1fdd92f6 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -8,6 +8,23 @@ # trying the change of xent_regularize from 0.025 (which was an # unusual value) to the more usual 0.01. +# WER is worse but this seems to be due to more complete optimization +# (train better, valid worse). Looks like we may be overtraining. 
+# +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# WER on train_dev(tg) 12.74 13.23 +# [looped:] 12.93 13.27 +# WER on train_dev(fg) 11.70 12.17 +# [looped:] 12.09 12.42 +# WER on eval2000(tg) 15.7 16.1 +# [looped:] 15.9 16.2 +# WER on eval2000(fg) 14.3 14.6 +# [looped:] 14.6 14.7 +# Final train prob -0.066 -0.065 +# Final valid prob -0.087 -0.090 +# Final train prob (xent) -0.931 -0.916 +# Final valid prob (xent) -1.0279 -1.0359 set -e diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 50bdc780a20..5e328ad1894 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -438,16 +438,24 @@ def verify_iterations(num_iters, num_epochs, num_hidden_layers, "layer-wise discriminatory training.") approx_iters_per_epoch_final = num_archives/num_jobs_final + # Note: it used to be that we would combine over an entire epoch, + # but in practice we very rarely would use any weights from towards + # the end of that range, so we are changing it to use not + # approx_iters_per_epoch_final, but instead: + # approx_iters_per_epoch_final/2 + 1, + # dividing by 2 to use half an epoch, and adding 1 just to make sure + # it's not zero. + # First work out how many iterations we want to combine over in the final # nnet3-combine-fast invocation. # The number we use is: - # min(max(max_models_combine, approx_iters_per_epoch_final), + # min(max(max_models_combine, approx_iters_per_epoch_final/2+1), # 1/2 * iters_after_last_layer_added) # But if this value is > max_models_combine, then the models # are subsampled to get these many models to combine. half_iters_after_add_layers = (num_iters - finish_add_layers_iter)/2 - num_iters_combine_initial = min(approx_iters_per_epoch_final, + num_iters_combine_initial = min(approx_iters_per_epoch_final/2 + 1, half_iters_after_add_layers) if num_iters_combine_initial > max_models_combine: @@ -647,8 +655,8 @@ def __init__(self, other random seeds used in other stages of the experiment like data preparation (e.g. volume perturbation).""") - self.parser.add_argument("--trainer.num-epochs", type=int, - dest='num_epochs', default=8, + self.parser.add_argument("--trainer.num-epochs", type=float, + dest='num_epochs', default=8.0, help="Number of epochs to train the model") self.parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', default=5000, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 9d7f649c4b4..4ffebcd9436 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -30,6 +30,15 @@ # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] # ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +# decay-time=-1 [If >0, an approximate maximum on how many frames +# can be remembered via summation into the cell +# contents c_t; enforced by putting a scaling factor +# of recurrence_scale = 1 - abs(delay)/decay_time on +# the recurrence, i.e. the term c_{t-1} in the LSTM +# equations. E.g. setting this to 20 means no more +# than about 20 frames' worth of history, +# i.e. history since about t = t-20, can be +# accumulated in c_t.] 
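+#                  As a worked example of the formula above: with delay=-3 and
+#                  decay-time=20, the scale on c_{t-1} is 1 - abs(-3)/20 = 0.85,
+#                  i.e. each recurrence step retains 85% of the cell state.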
class XconfigLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "lstm-layer" @@ -44,7 +53,8 @@ def set_default_configs(self): 'ng-affine-options' : ' max-change=0.75 ', 'self-repair-scale-nonlinearity' : 0.00001, 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0 + 'zeroing-threshold' : 15.0, + 'decay-time': -1.0 } def set_derived_configs(self): @@ -108,17 +118,23 @@ def generate_lstm_config(self): input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] delay = self.config['delay'] - - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + decay_time = self.config['decay-time'] + # we expect decay_time to be either -1, or large, like 10 or 50. + recurrence_scale = (1.0 if decay_time < 0 else + 1.0 - (abs(delay) / decay_time)) + assert recurrence_scale > 0 # or user may have set decay-time much + # too small. bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" " recurrence-interval={3}" + " scale={4}" "".format(self.config['clipping-threshold'], self.config['zeroing-threshold'], self.config['zeroing-interval'], - abs(delay))) + abs(delay), recurrence_scale)) + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' affine_str = self.config['ng-affine-options'] # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options @@ -233,6 +249,15 @@ def generate_lstm_config(self): # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] # ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +# decay-time=-1 [If >0, an approximate maximum on how many frames +# can be remembered via summation into the cell +# contents c_t; enforced by putting a scaling factor +# of recurrence_scale = 1 - abs(delay)/decay_time on +# the recurrence, i.e. the term c_{t-1} in the LSTM +# equations. E.g. setting this to 20 means no more +# than about 20 frames' worth of history, +# i.e. history since about t = t-20, can be +# accumulated in c_t.] class XconfigLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "lstmp-layer" @@ -252,7 +277,8 @@ def set_default_configs(self): 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added - 'dropout-per-frame' : False # If false, regular dropout, not per frame. + 'dropout-per-frame' : False, # If false, regular dropout, not per frame. + 'decay-time': -1.0 } def set_derived_configs(self): @@ -342,14 +368,21 @@ def generate_lstm_config(self): delay = self.config['delay'] repair_nonlin = self.config['self-repair-scale-nonlinearity'] repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + decay_time = self.config['decay-time'] + # we expect decay_time to be either -1, or large, like 10 or 50. + recurrence_scale = (1.0 if decay_time < 0 else + 1.0 - (abs(delay) / decay_time)) + assert recurrence_scale > 0 # or user may have set decay-time much + # too small. 
bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" " recurrence-interval={3}" + " scale={4}" "".format(self.config['clipping-threshold'], self.config['zeroing-threshold'], self.config['zeroing-interval'], - abs(delay))) + abs(delay), recurrence_scale)) affine_str = self.config['ng-affine-options'] pes_str = self.config['ng-per-element-scale-options'] dropout_proportion = self.config['dropout-proportion'] @@ -578,7 +611,6 @@ def generate_lstm_config(self): 1.0 - (abs(delay) / decay_time)) assert recurrence_scale > 0 # or user may have set decay-time much # too small. - lstm_str = self.config['lstm-nonlinearity-options'] bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" @@ -588,6 +620,8 @@ def generate_lstm_config(self): self.config['zeroing-threshold'], self.config['zeroing-interval'], abs(delay), recurrence_scale)) + lstm_str = self.config['lstm-nonlinearity-options'] + configs = [] @@ -772,7 +806,6 @@ def generate_lstm_config(self): 1.0 - (abs(delay) / decay_time)) assert recurrence_scale > 0 # or user may have set decay-time much # too small. - bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" " zeroing-interval={2}" diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 1791aee665b..8624dc947b9 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -101,8 +101,8 @@ def get_args(): help="Deprecated. Kept for back compatibility") # trainer options - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default=10, + parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs', + default=10.0, help="Number of epochs to train the model") parser.add_argument("--trainer.frames-per-iter", type=int, dest='frames_per_iter', default=800000, @@ -391,7 +391,7 @@ def train(args, run_opts, background_process_handler): # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == # $num_epochs*$num_archives, where # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. - num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_to_process = int(args.num_epochs * num_archives_expanded) num_archives_processed = 0 num_iters = ((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)) diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 9874b03051a..c400442d429 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -271,7 +271,7 @@ def train(args, run_opts, background_process_handler): # $num_epochs*$num_archives, where # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. num_archives_expanded = num_archives * args.frames_per_eg - num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_to_process = int(args.num_epochs * num_archives_expanded) num_archives_processed = 0 num_iters = ((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 48e647d9c5e..0264b409e46 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -275,7 +275,7 @@ def train(args, run_opts, background_process_handler): # $num_epochs*$num_archives, where # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
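    # (Rough worked example, with illustrative numbers only: with num_epochs=8,
    # 1000 expanded archives and num-jobs going from 3 to 16,
    # num_archives_to_process would be 8000 and num_iters about
    # 8000*2/(3+16) = 842.)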
num_archives_expanded = num_archives * args.frames_per_eg - num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_to_process = int(args.num_epochs * num_archives_expanded) num_archives_processed = 0 num_iters = ((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 97ab378f5fd..5a96d6020fa 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -341,7 +341,7 @@ def train(args, run_opts, background_process_handler): # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == # $num_epochs*$num_archives, where # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. - num_archives_to_process = args.num_epochs * num_archives + num_archives_to_process = int(args.num_epochs * num_archives) num_archives_processed = 0 num_iters = ((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 19da38db958..5824a77dbfe 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -338,7 +338,7 @@ def train(args, run_opts, background_process_handler): # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == # $num_epochs*$num_archives, where # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. - num_archives_to_process = args.num_epochs * num_archives + num_archives_to_process = int(args.num_epochs * num_archives) num_archives_processed = 0 num_iters = ((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)) diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc index 09f01cd947b..c93858fb06e 100644 --- a/src/nnet3/nnet-chain-combine.cc +++ b/src/nnet3/nnet-chain-combine.cc @@ -185,7 +185,7 @@ void NnetChainCombiner::Combine() { void NnetChainCombiner::PrintParams(const VectorBase ¶ms) const { - Vector weights(params.Dim()), normalized_weights(params.Dim()); + Vector weights(WeightDim()), normalized_weights(WeightDim()); GetWeights(params, &weights); GetNormalizedWeights(weights, &normalized_weights); int32 num_models = nnet_params_.NumRows(), diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc index 207cfbe8269..a63e75f91c6 100644 --- a/src/nnet3/nnet-combine.cc +++ b/src/nnet3/nnet-combine.cc @@ -181,8 +181,7 @@ void NnetCombiner::Combine() { void NnetCombiner::PrintParams(const VectorBase ¶ms) const { - - Vector weights(params.Dim()), normalized_weights(params.Dim()); + Vector weights(WeightDim()), normalized_weights(WeightDim()); GetWeights(params, &weights); GetNormalizedWeights(weights, &normalized_weights); int32 num_models = nnet_params_.NumRows(), From ed2cedc9a030db0c8c948ca57c58141f4280bd75 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 2 Feb 2017 20:21:02 -0500 Subject: [PATCH 139/213] [egs,scripts]: add Swbd/s5c tuning scripts; simplify nnet3+chain 'combination' stage (doesn't affect results; faster); minor info-script fix. 
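
One convenient way to compare the tuning systems referenced here, once they
have been trained, is the comparison script already used in the comments of
these files (the directory names below are only an example and depend on
which systems were actually run):

  local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1g_sp
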
--- egs/swbd/s5c/local/chain/run_tdnn_lstm.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1c.sh | 22 +- .../local/chain/tuning/run_tdnn_lstm_1d.sh | 22 ++ .../local/chain/tuning/run_tdnn_lstm_1e.sh | 17 + .../local/chain/tuning/run_tdnn_lstm_1f.sh | 23 +- .../local/chain/tuning/run_tdnn_lstm_1g.sh | 21 ++ .../local/chain/tuning/run_tdnn_lstm_1h.sh | 279 ++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1i.sh | 300 ++++++++++++++++++ egs/wsj/s5/steps/info/chain_dir_info.pl | 2 +- .../nnet3/train/chain_objf/acoustic_model.py | 13 +- 10 files changed, 686 insertions(+), 15 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh diff --git a/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh b/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh index 9669251c14a..fbc28248491 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh @@ -1 +1 @@ -tuning/run_tdnn_lstm_1c.sh \ No newline at end of file +tuning/run_tdnn_lstm_1e.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh index b305c57b6ab..d71301eb102 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -5,15 +5,19 @@ # it's faster. See PR #1243 on github, and issue #1237. # This used to be called run_tdnn_fastlstm_1b.sh. -#System tdnn_lstm_1a_ld5 tdnn_lstm_1b_ld5 tdnn_lstm_1c_ld5 -#WER on train_dev(tg) 13.42 13.00 12.91 -#WER on train_dev(fg) 12.42 12.03 11.98 -#WER on eval2000(tg) 15.7 15.3 15.2 -#WER on eval2000(fg) 14.2 13.9 13.8 -#Final train prob -0.0538088 -0.056294 -0.050 -#Final valid prob -0.0800484 -0.0813322 -0.092 -#Final train prob (xent) -0.7603 -0.777787 -0.756 -#Final valid prob (xent) -0.949909 -0.939146 -0.983 +## note: the last column below was this run on Feb 1 2017, in the +## shortcut branch. Results are a bit worse, but I believe this is just +## random noise or a little bit of mean-regression. + +#System tdnn_lstm_1a_ld5_sp tdnn_lstm_1b_ld5_sp tdnn_lstm_1c_ld5_sp tdnn_lstm_1c_ld5_sp +#WER on train_dev(tg) 13.42 13.00 12.91 13.17 +#WER on train_dev(fg) 12.42 12.03 11.98 12.25 +#WER on eval2000(tg) 15.7 15.3 15.2 15.4 +#WER on eval2000(fg) 14.2 13.9 13.8 14.1 +#Final train prob -0.0538088 -0.056294 -0.050 -0.046 +#Final valid prob -0.0800484 -0.0813322 -0.092 -0.073 +#Final train prob (xent) -0.7603 -0.777787 -0.756 -0.749 +#Final valid prob (xent) -0.949909 -0.939146 -0.983 -0.980 set -e diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh index 837eb944875..22c7d2e582d 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -8,6 +8,28 @@ # Also changed frames-per-iter from 1.2 million to 1.5 million... this # might have been a mistake, trying 1 million in 1f to see if this matters. +# The comparison below is with a version of the 1c system that was run at about +# the same time. The degradation in log-likelihood and xent prob is likely because +# now on average the chunk-size is slightly smaller than before (150 -> 136); +# possibly the change in extra-(left,right) context has a similar effect +# (or maybe it's just because the validation and train-subset examples have changed). 
+ + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1c_ld5_sp tdnn_lstm_1d_sp +# System tdnn_lstm_1c_ld5_sp tdnn_lstm_1d_sp +# WER on train_dev(tg) 13.17 12.90 +# [looped:] 13.01 +# WER on train_dev(fg) 12.25 11.90 +# [looped:] 12.13 +# WER on eval2000(tg) 15.4 15.7 +# [looped:] 15.7 +# WER on eval2000(fg) 14.1 14.2 +# [looped:] 14.4 +# Final train prob -0.046 -0.064 +# Final valid prob -0.073 -0.088 +# Final train prob (xent) -0.749 -0.836 +# Final valid prob (xent) -0.9084 -0.9631 + # run_tdnn_lstm_1c.sh is like run_tdnn_lstm_1b.sh but using the # new 'fast-lstm' layer. Results are slightly improved, plus # it's faster. See PR #1243 on github, and issue #1237. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh index 14dbb1cdd2e..f8b3d70aa2b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -4,6 +4,23 @@ # trying the change of xent_regularize from 0.025 (which was an # unusual value) to the more usual 0.01. +# There seems to be no consistent difference in WER. Inconclusive. +# However I may keep 0.01 just for consistency with other setups. +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1d_sp tdnn_lstm_1e_sp +# System tdnn_lstm_1d_sp tdnn_lstm_1e_sp +# WER on train_dev(tg) 12.90 12.74 +# [looped:] 13.01 12.93 +# WER on train_dev(fg) 11.90 11.70 +# [looped:] 12.13 12.09 +# WER on eval2000(tg) 15.7 15.7 +# [looped:] 15.7 15.9 +# WER on eval2000(fg) 14.2 14.3 +# [looped:] 14.4 14.6 +# Final train prob -0.064 -0.066 +# Final valid prob -0.088 -0.087 +# Final train prob (xent) -0.836 -0.931 +# Final valid prob (xent) -0.9631 -1.0279 + set -e diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh index b8f1fdd92f6..90e179379e4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -2,7 +2,28 @@ # run_tdnn_lstm_1f.sh is like run_tdnn_lstm_1e.sh but # reducing the frames-per-iter from 1.5 million to 1 million, -# since the time per iter was too much (about 5 minutes). +# since the time per iter was more than usual (about 5 minutes). + +# Below, the WER seems to get a little worse, although the optimization +# is improved slightly. There seems to be more train/valid difference. +# see also 1i. 
+ +# exp/chain/tdnn_lstm_1f_sp: num-iters=392 nj=3..16 num-params=39.6M dim=40+100->6042 combine=-0.080->-0.073 xent:train/valid[260,391,final]=(-1.06,-0.903,-0.916/-1.13,-1.03,-1.04) logprob:train/valid[260,391,final]=(-0.084,-0.064,-0.065/-0.100,-0.091,-0.090) + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# WER on train_dev(tg) 12.74 13.23 +# [looped:] 12.93 13.27 +# WER on train_dev(fg) 11.70 12.17 +# [looped:] 12.09 12.42 +# WER on eval2000(tg) 15.7 16.1 +# [looped:] 15.9 16.2 +# WER on eval2000(fg) 14.3 14.6 +# [looped:] 14.6 14.7 +# Final train prob -0.066 -0.065 +# Final valid prob -0.087 -0.090 +# Final train prob (xent) -0.931 -0.916 +# Final valid prob (xent) -1.0279 -1.0359 # run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but # trying the change of xent_regularize from 0.025 (which was an diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh index 6cacdf2dadb..cb73f020e3e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -3,6 +3,27 @@ # 1g is like 1e, but reducing decay-time from 20 to 15, to see if # it reduces the difference between regular and looped decoding. # +# There doesn't seem to be a very consistent difference betwen 1e and 1g. + + +# exp/chain/tdnn_lstm_1g_sp: num-iters=262 nj=3..16 num-params=39.6M dim=40+100->6042 combine=-0.083->-0.076 xent:train/valid[173,261,final]=(-1.09,-0.929,-0.938/-1.15,-1.04,-1.05) logprob:train/valid[173,261,final]=(-0.089,-0.066,-0.067/-0.102,-0.089,-0.090) + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1g_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1g_sp +# WER on train_dev(tg) 12.74 13.03 +# [looped:] 12.93 12.98 +# WER on train_dev(fg) 11.70 12.02 +# [looped:] 12.09 12.13 +# WER on eval2000(tg) 15.7 15.6 +# [looped:] 15.9 15.9 +# WER on eval2000(fg) 14.3 14.1 +# [looped:] 14.6 14.4 +# Final train prob -0.066 -0.067 +# Final valid prob -0.087 -0.090 +# Final train prob (xent) -0.931 -0.938 +# Final valid prob (xent) -1.0279 -1.0473 + + # run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but # trying the change of xent_regularize from 0.025 (which was an # unusual value) to the more usual 0.01. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh new file mode 100755 index 00000000000..b12be22ce3d --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +# 1h is like 1e, but reducing the hidden-dims from 1024 to 880. + +# Does not seem to help; both train and valid probs get worse by about +# the same amount, and WER is overall just slightly worse. Maybe 1024 +# was approximately optimal. + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1h_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1h_sp +# WER on train_dev(tg) 12.74 13.06 +# [looped:] 12.93 13.17 +# WER on train_dev(fg) 11.70 12.13 +# [looped:] 12.09 12.27 +# WER on eval2000(tg) 15.7 15.7 +# [looped:] 15.9 15.9 +# WER on eval2000(fg) 14.3 14.4 +# [looped:] 14.6 14.5 +# Final train prob -0.066 -0.069 +# Final valid prob -0.087 -0.091 +# Final train prob (xent) -0.931 -0.967 +# Final valid prob (xent) -1.0279 -1.0631 + +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. 
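+# As a rough worked example (just arithmetic on the settings used below, not a
+# new result): the xent output's learning-rate factor is computed further down
+# as
+#   learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+# so moving xent_regularize from 0.025 to 0.01 also moves that factor from
+# 0.5 / 0.025 = 20 up to 0.5 / 0.01 = 50.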
+ + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir=exp/chain/tdnn_lstm_1d_sp/egs + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=880 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=880 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=880 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=880 recurrent-projection-dim=220 non-recurrent-projection-dim=220 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=880 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=880 + fast-lstmp-layer name=fastlstm2 cell-dim=880 recurrent-projection-dim=220 non-recurrent-projection-dim=220 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=880 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=880 + fast-lstmp-layer name=fastlstm3 cell-dim=880 recurrent-projection-dim=220 non-recurrent-projection-dim=220 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. 
Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh new file mode 100755 index 00000000000..7e05834c1fb --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -0,0 +1,300 @@ +#!/bin/bash + +# run_tdnn_lstm_1i.sh is like run_tdnn_lstm_1{e,f}.sh but +# with a different frames-per-iter: 2 million, vs. 1.5 million +# (1e) and 1 million (1f) + +# Results are inconclusive regarding comparison with 1e: it's [0.3 worse, 0.1 +# better, 0.2 worse, same, 0.2 better, 0.2 better, 0.3 better, 0.3 better] on +# the different conditions. There is less train/valid difference and worse +# train prob [the trends of valid and train probs are consistent as we change +# the frames-per-iter]. + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1{e,f,i}_sp 2>/dev/null +# System tdnn_lstm_1e_sp tdnn_lstm_1f_sp tdnn_lstm_1i_sp +# WER on train_dev(tg) 12.74 13.23 13.08 +# [looped:] 12.93 13.27 13.00 +# WER on train_dev(fg) 11.70 12.17 11.97 +# [looped:] 12.09 12.42 12.08 +# WER on eval2000(tg) 15.7 16.1 15.5 +# [looped:] 15.9 16.2 15.7 +# WER on eval2000(fg) 14.3 14.6 14.0 +# [looped:] 14.6 14.7 14.3 +# Final train prob -0.066 -0.065 -0.069 +# Final valid prob -0.087 -0.090 -0.088 +# Final train prob (xent) -0.931 -0.916 -0.947 +# Final valid prob (xent) -1.0279 -1.0359 -1.0419 + +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + +# WER is worse but this seems to be due to more complete optimization +# (train better, valid worse). Looks like we may be overtraining. +# +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# WER on train_dev(tg) 12.74 13.23 +# [looped:] 12.93 13.27 +# WER on train_dev(fg) 11.70 12.17 +# [looped:] 12.09 12.42 +# WER on eval2000(tg) 15.7 16.1 +# [looped:] 15.9 16.2 +# WER on eval2000(fg) 14.3 14.6 +# [looped:] 14.6 14.7 +# Final train prob -0.066 -0.065 +# Final valid prob -0.087 -0.090 +# Final train prob (xent) -0.931 -0.916 +# Final valid prob (xent) -1.0279 -1.0359 + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1i # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. 
+frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b1{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 2000000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/wsj/s5/steps/info/chain_dir_info.pl b/egs/wsj/s5/steps/info/chain_dir_info.pl index 1d659b89c89..b0adb7e498c 100755 --- a/egs/wsj/s5/steps/info/chain_dir_info.pl +++ b/egs/wsj/s5/steps/info/chain_dir_info.pl @@ -136,7 +136,7 @@ sub get_combine_info { while () { if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) { close(F); - return sprintf(" combine=%.2f->%.2f", $1, $2); + return sprintf(" combine=%.3f->%.3f", $1, $2); } } } diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index fde8ae65461..f28aa89774e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -543,19 +543,24 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st models_to_combine.add(num_iters) + # TODO: if it turns out the sum-to-one-penalty code is not useful, + # remove support for it. + for iter in sorted(models_to_combine): model_file = '{0}/{1}.mdl'.format(dir, iter) if os.path.exists(model_file): - raw_model_strings.append( - '"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + # we used to copy them with nnet3-am-copy --raw=true, but now + # the raw-model-reading code discards the other stuff itself. + raw_model_strings.append(model_file) else: print("{0}: warning: model file {1} does not exist " "(final combination)".format(sys.argv[0], model_file)) common_lib.run_job( """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-chain-combine --num-iters=80 \ + nnet3-chain-combine --num-iters={opt_iters} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --separate-weights-per-component={separate_weights} \ --enforce-sum-to-one={hard_enforce} \ --sum-to-one-penalty={penalty} \ --enforce-positive-weights=true \ @@ -568,6 +573,8 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st {dir}/final.mdl""".format( command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, + opt_iters=(20 if sum_to_one_penalty <= 0 else 80), + separate_weights=(sum_to_one_penalty > 0), lc=left_context, rc=right_context, l2=l2_regularize, leaky=leaky_hmm_coefficient, dir=dir, raw_models=" ".join(raw_model_strings), From 954815a248ae524722156d50bf43e0a99aad3b12 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Wed, 8 Feb 2017 10:48:06 -0800 Subject: [PATCH 140/213] Resolve conflicts due to cross compilation changes in master (#1400) * [build]: resolving OpenFst compilation issue with gcc-6.x (#1392) * [egs] Add new graphemic system for Gale Arabic, with newer nnet scripts (#1298) * [build] Windows build: generate missing base/version.h; cosmetic changes (#1397) * [build]: Enable cross compilation, including to android. 
(#726) If a user has a number of tool chains installed and they do not want to use the default, they must currently edit the kaldi.mk file after running configure to change the CC, CXX, AR, AS, and RANLIB variables. This is something that should be exposed via the configure script. This patch exposes an option to set the host triple for the desired tool chain in the configure script. Building Kaldi on my Raspberry Pi boards is not particularly fast. I have been using the following patch to build kaldi executables for use on the Pi boards for the better part of a year. A typical invocation for me is something like: $ ./configure --static --atlas-root=/opt/cross/armv8hf \ --fst-root=/opt/cross/armv8hf --host=armv8-rpi3-linux-gnueabihf \ --fst-version=1.4.1 This way I can build on my much faster x86 desktop, but still run experiments on ARM. I have included support for cross compiling for ppc64le and it works for me (at least it produces binaries for ppc64le I don't have a ppc64 machine to test it). Signed-off-by: Eric B Munson * Add mk file and configure options for building for Android Building for Android requires a toolchain that can be built using the Android NDK. It works similiarly to the linux build except that it only uses clang, only supports the openBLAS math library, and requires an additional include directory for the system C++ headers. A typical configure invocation looks like: ./configure --static --openblas-root=/opt/cross/arm-linux-androideabi \ --fst-root=/opt/cross/arm-linux-androideabi \ --host=arm-linux-androideabi --fst-version=1.4.1 \ --android-includes=/opt/cross/arm-linux-androideabi/sysroot/usr/include Signed-off-by: Eric B Munson * Make pthread cancel symbols noops for Android The Android C library does not support cancelling pthreads so the symbols PTHREAD_CANCEL_STATE and pthread_setcancelstate are undefined. Because a pthread cannot be cancelled in Android, it is reasonable to make the pthread_setcancelstate() call a noop. Signed-off-by: Eric B Munson * A few small fixes for configure * Reword the clang++ requirement for android builds. --- src/configure | 651 ++++++++++++++---------- src/makefiles/android_openblas.mk | 74 +-- src/makefiles/cygwin.mk | 4 - src/makefiles/darwin.mk | 4 - src/makefiles/linux_atlas.mk | 4 - src/makefiles/linux_atlas_arm.mk | 4 - src/makefiles/linux_atlas_ppc64le.mk | 4 - src/makefiles/linux_clapack.mk | 4 - src/makefiles/linux_clapack_arm.mk | 4 - src/makefiles/linux_openblas.mk | 4 - src/makefiles/linux_openblas_arm.mk | 4 - src/makefiles/linux_openblas_ppc64le.mk | 4 - src/makefiles/linux_x86_64_mkl.mk | 4 - windows/get_version.pl | 2 +- 14 files changed, 421 insertions(+), 350 deletions(-) diff --git a/src/configure b/src/configure index bf478b5b73f..a4f3ce1c8b3 100755 --- a/src/configure +++ b/src/configure @@ -15,17 +15,32 @@ # ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes # ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes --mkl-threading=tbb # # This is for MKL 11.3, which does not seem to provide Intel OMP libs -# ./configure --openblas-root=../tools/OpenBLAS/install # before doing -# # this, cd to ../tools and type "make openblas". Note: -# # this is not working correctly on all platforms, do "make test" +# ./configure --openblas-root=../tools/OpenBLAS/install +# # Before doing this, cd to ../tools and type "make openblas". +# # Note: this is not working correctly on all platforms, do "make test" # # and look out for segmentation faults. 
# ./configure --atlas-root=../tools/ATLAS/build # ./configure --use-cuda=no # disable CUDA detection (will build cpu-only # # version of kaldi even on CUDA-enabled machine +# ./configure --static --fst-root=/opt/cross/armv8hf \ +# --atlas-root=/opt/cross/armv8hf --host=armv8-rpi3-linux-gnueabihf +# # Cross compile for armv8hf, this assumes that you have openfst built +# # with the armv8-rpi3-linux-gnueabihf toolchain and installed to +# # /opt/cross/armv8hf. It also assumes that you have an ATLAS library +# # built for the target install to /opt/cross/armv8hf and that the +# # armv8-rpi3-linux-gnueabihf toolchain is available in your path +# ./configure --static --openblas-root=/opt/cross/arm-linux-androideabi \ +# --fst-root=/opt/cross/arm-linux-androideabi --fst-version=1.4.1 \ +# --android-incdir=/opt/cross/arm-linux-androideabi/sysroot/usr/include \ +# --host=arm-linux-androideabi +# # Cross compile for Android on arm. The only difference here is the +# # addition of the the --android-includes flag because the toolchains +# # produced by the Android NDK don't always include the C++ stdlib +# # headers in the normal cross compile include path. # This should be incremented after any significant change to the configure # script, i.e. any change affecting kaldi.mk or the build system as a whole. -CONFIGURE_VERSION=5 +CONFIGURE_VERSION=6 if ! [ -x "$PWD/configure" ]; then echo 'You must run "configure" from the src/ directory.' @@ -42,35 +57,46 @@ The default configuration is to build and link against static Kaldi libraries. OpenFst and Math libraries are linked dynamically. Configuration options: - --help Display this help message and exit - --version Display the version of 'configure' and exit - --static Build and link against static libraries [default=no] - --shared Build and link against shared libraries [default=no] - --use-cuda Build with CUDA [default=yes] - --cudatk-dir=DIR CUDA toolkit directory - --double-precision Build with double precision numbers [default=no] - --static-fst Build with static OpenFst libraries [default=no] - --fst-root=DIR OpenFst root directory [default=../tools/openfst/] - --mathlib=LIB Math library [default=ATLAS] - Supported libraries: ATLAS, MKL, CLAPACK, OPENBLAS. - --static-math Build with static math libraries [default=no] - --threaded-math Build with multi-threaded math libraries [default=no] - --threaded-atlas Build with multi-threaded ATLAS libraries [default=no] - --atlas-root=DIR ATLAS root directory [default=../tools/ATLAS/] - --openblas-root=DIR OpenBLAS root directory - --clapack-root=DIR CLAPACK root directory - --mkl-root=DIR MKL root directory - --mkl-libdir=DIR MKL library directory - --mkl-threading=LIB MKL threading layer [default=sequential] - Supported layers: sequential, iomp, tbb, gomp. - --omp-libdir=DIR OpenMP directory - --speex-root=DIR SPEEX root directory - --speex-libdir=DIR SPEEX library directory - --speex-incdir=DIR SPEEX include directory - -Following environment variables can be used to override the compiler -or to provide additional flags to the compiler/linker. 
- CXX C++ compiler command + --help Display this help message and exit + --version Display the version of 'configure' and exit + --static Build and link against static libraries [default=no] + --shared Build and link against shared libraries [default=no] + --use-cuda Build with CUDA [default=yes] + --cudatk-dir=DIR CUDA toolkit directory + --double-precision Build with double precision floats [default=no] + --static-fst Build with static OpenFst libraries [default=no] + --fst-root=DIR OpenFst root directory [default=../tools/openfst/] + --fst-version=STR OpenFst version string + --mathlib=LIB Math library [default=ATLAS] + Supported libraries: ATLAS, MKL, CLAPACK, OPENBLAS. + --static-math Build with static math libraries [default=no] + --threaded-math Build with multi-threaded math libraries [default=no] + --threaded-atlas Build with multi-threaded ATLAS libraries [default=no] + --atlas-root=DIR ATLAS root directory [default=../tools/ATLAS/] + --openblas-root=DIR OpenBLAS root directory + --clapack-root=DIR CLAPACK root directory + --mkl-root=DIR MKL root directory + --mkl-libdir=DIR MKL library directory + --mkl-threading=LIB MKL threading layer [default=sequential] + Supported layers: sequential, iomp, tbb, gomp. + --omp-libdir=DIR OpenMP directory + --speex-root=DIR SPEEX root directory + --speex-libdir=DIR SPEEX library directory + --speex-incdir=DIR SPEEX include directory + --host=HOST Host triple in the format 'cpu-vendor-os' + If provided, it is prepended to all toolchain programs. + --android-incdir=DIR Andraid include directory + +Following environment variables can be used to override the default toolchain. + CXX C++ compiler [default=g++] + AR Archive maintenance utility [default=ar] + AS Assembler [default=as] + RANLIB Archive indexing utility [default=ranlib] + +If a host triple is provided, it is prepended to CXX, AR, AS and RANLIB. + +Following environment variables can be used to provide additional flags to the +compiler/linker. CXXFLAGS Additional C++ compiler flags, e.g. -I LDFLAGS Additional linker flags, e.g. -L LDLIBS Additional libraries to pass to the linker, e.g. -l @@ -111,6 +137,16 @@ function check_exists { if [ ! -f $1 ]; then failure "$1 not found."; fi } +function check_library { + local libpath=$1 + local libname=$2 + local libext=$3 + local full_libname="$libpath/$libname.$libext" + ##echo "Testing $full_libname" >&2 + test -f "$full_libname" && return ; + return 1 +} + function check_compiler { COMPILER=$1 if ! which $COMPILER >&/dev/null; then @@ -151,190 +187,21 @@ function check_compiler { } function check_for_slow_expf { - cd probe - rm -f exp-test - make -f Makefile.slow_expf 1>/dev/null - ./exp-test - if [ $? -eq 1 ]; then - echo "*** WARNING: expf() seems to be slower than exp() on your machine. This is a known bug in old versions of glibc. Please consider updating glibc. ***" - echo "*** Kaldi will be configured to use exp() instead of expf() in base/kaldi-math.h Exp() routine for single-precision floats. ***" - echo "CXXFLAGS += -DKALDI_NO_EXPF" >> ../kaldi.mk + # We cannot run this test if we are cross compiling. + if [[ "$TARGET_ARCH" == "`uname -m`" ]] ; then + cd probe + rm -f exp-test + make -f Makefile.slow_expf 1>/dev/null + ./exp-test + if [ $? -eq 1 ]; then + echo "*** WARNING: expf() seems to be slower than exp() on your machine. This is a known bug in old versions of glibc. Please consider updating glibc. 
***" + echo "*** Kaldi will be configured to use exp() instead of expf() in base/kaldi-math.h Exp() routine for single-precision floats. ***" + echo "CXXFLAGS += -DKALDI_NO_EXPF" >> ../kaldi.mk + fi + cd .. fi - cd .. -} - -function check_library { - local libpath=$1 - local libname=$2 - local libext=$3 - local full_libname="$libpath/$libname.$libext" - ##echo "Testing $full_libname" >&2 - test -f "$full_libname" && return ; - return 1 } -# If configuration sets any of these variables, we will switch the external -# math library. Here we unset them so that we can check later. -unset MKLROOT -unset CLAPACKROOT -unset OPENBLASROOT -unset MKLLIBDIR - -# These environment variables are OK. -CXX=${CXX:-g++} -ENV_CXXFLAGS=$CXXFLAGS -ENV_LDFLAGS=$LDFLAGS -ENV_LDLIBS=$LDLIBS - -# Default configuration -dynamic_kaldi=false -use_cuda=true -static_fst=false -static_math=false -threaded_atlas=false -mkl_threading=sequential -double_precision=false - -MATHLIB='ATLAS' -ATLASROOT=`rel2abs ../tools/ATLAS/` -FSTROOT=`rel2abs ../tools/openfst` - -# Save the command line to include in kaldi.mk -cmd_line="$0 $@" - -while [ $# -gt 0 ]; -do - case "$1" in - --help) - usage; exit 0 ;; - --version) - echo $CONFIGURE_VERSION; exit 0 ;; - --static) - dynamic_kaldi=false; - static_math=true; - static_fst=true; - shift ;; - --shared) - dynamic_kaldi=true; - static_math=false; - static_fst=false; - shift ;; - --double-precision) - double_precision=true; - shift ;; - --double-precision=yes) - double_precision=true; - shift ;; - --double-precision=no) - double_precision=false; - shift ;; - --atlas-root=*) - ATLASROOT=`read_dirname $1`; - shift ;; - --threaded-atlas) - threaded_atlas=true; - shift ;; - --threaded-atlas=yes) - threaded_atlas=true; - shift ;; - --threaded-atlas=no) - threaded_atlas=false; - shift ;; - --threaded-math) - threaded_atlas=true; - mkl_threading=iomp - shift ;; - --threaded-math=yes) - threaded_atlas=true; - mkl_threading=iomp - shift ;; - --threaded-math=no) - threaded_atlas=false; - mkl_threading=sequential - shift ;; - --use-cuda) - use_cuda=true; - shift ;; - --use-cuda=yes) - use_cuda=true; - shift ;; - --use-cuda=no) - use_cuda=false; - shift ;; - --static-math) - static_math=true; - shift ;; - --static-math=yes) - static_math=true; - shift ;; - --static-math=no) - static_math=false; - shift ;; - --static-fst) - static_fst=true; - shift ;; - --static-fst=yes) - static_fst=true; - shift ;; - --static-fst=no) - static_fst=false; - shift ;; - --mkl-threading=sequential) - threaded_atlas=false; - mkl_threading=sequential; - shift ;; - --mkl-threading=*) - mkl_threading=`expr "X$1" : '[^=]*=\(.*\)'`; - threaded_atlas=true; - shift ;; - --fst-root=*) - FSTROOT=`read_dirname $1`; - shift ;; - --clapack-root=*) - CLAPACKROOT=`read_dirname $1`; - shift ;; - --openblas-root=*) - OPENBLASROOT=`read_dirname $1`; - shift ;; - --mkl-root=*) - MKLROOT=`read_dirname $1`; - shift ;; - --mkl-libdir=*) - MKLLIBDIR=`read_dirname $1`; - shift ;; - --speex-root=*) - SPEEXROOT=`read_dirname $1`; - shift ;; - --speex-libdir=*) - SPEEXLIBDIR=`read_dirname $1`; - shift ;; - --speex-incdir=*) - SPEEXINCLUDEDIR=`read_dirname $1`; - shift ;; - --omp-libdir=*) - OMPLIBDIR=`read_dirname $1`; - shift ;; - --mathlib=*) - MATHLIB=`expr "X$1" : '[^=]*=\(.*\)'`; - shift ;; - --cudatk-dir=*) - CUDATKDIR=`read_dirname $1`; - shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only - *) echo "Unknown argument: $1, exiting"; usage; exit 1 ;; - esac -done - -# The idea here is that if you change the configuration options 
from using -# CUDA to not using it, or vice versa, we want to recompile all parts of the -# code that may use a GPU. Touching this file is a way to force this. -touch cudamatrix/cu-common.h 2>/dev/null - -# If one of these variables is set, switch the external math library. -is_set $MKLLIBDIR && echo "Configuring KALDI to use MKL" && export MATHLIB="MKL" -is_set $MKLROOT && echo "Configuring KALDI to use MKL"&& export MATHLIB="MKL" -is_set $CLAPACKROOT && echo "Configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" -is_set $OPENBLASROOT && echo "Configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" - # MKL functions function linux_configure_mkllibdir { local mklroot=$1 @@ -508,6 +375,11 @@ function configure_cuda { if [ ! -f $CUDATKDIR/bin/nvcc ]; then failure "Cannnot find nvcc in CUDATKDIR=$CUDATKDIR" fi + + if [[ "$TARGET_ARCH" != "`uname -m`" ]] ; then + failure "Cannot cross compile with CUDA support" + fi + echo "Using CUDA toolkit $CUDATKDIR (nvcc compiler and runtime libraries)" echo >> kaldi.mk echo "# CUDA configuration" >> kaldi.mk @@ -532,7 +404,7 @@ function configure_cuda { echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk echo >> kaldi.mk - # 64bit/32bit? + # 64bit/32bit? We do not support cross compilation with CUDA so, use direct calls to uname -m here if [ "`uname -m`" == "x86_64" ]; then if [ "`uname`" == "Darwin" ]; then sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk @@ -556,7 +428,7 @@ function linux_configure_speex { # Check whether the user has called tools/extras/install_speex.sh or not [ ! -z "$SPEEXROOT" ] || SPEEXROOT=`pwd`/../tools/speex [ ! -z "$SPEEXLIBDIR" ] || SPEEXLIBDIR="$SPEEXROOT"/lib - [ ! -z "$SPEEXINCLUDEDIR" ] || SPEEXINCLUDEDIR="$SPEEXROOT"/include + [ ! -z "$SPEEXINCDIR" ] || SPEEXINCDIR="$SPEEXROOT"/include static_speex=$1 if [ "foo"$static_speex == "foo" ]; then static_speex=false @@ -573,9 +445,9 @@ function linux_configure_speex { return fi - if [ -f $SPEEXINCLUDEDIR/speex/speex.h ]; then + if [ -f $SPEEXINCDIR/speex/speex.h ]; then echo >> kaldi.mk - echo CXXFLAGS += -DHAVE_SPEEX -I${SPEEXINCLUDEDIR} >> kaldi.mk + echo CXXFLAGS += -DHAVE_SPEEX -I${SPEEXINCDIR} >> kaldi.mk if $static_speex; then echo LDLIBS += $SPEEXLIBDIR/libspeex.a @@ -594,12 +466,12 @@ function linux_atlas_failure { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = [somewhere]/liblapack.a [somewhere]/libcblas.a [somewhere]/libatlas.a [somewhere]/libf77blas.a $ATLASLIBDIR >> kaldi.mk echo >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then - cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "`uname -m`" == ppc64le ]]; then - cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk + if [[ "$TARGET_ARCH" == arm* ]]; then + cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "$TARGET_ARCH" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else - cat makefiles/linux_atlas.mk >> kaldi.mk + cat makefiles/linux_atlas.mk >> kaldi.mk fi echo "** $* ***" echo "** ERROR **" @@ -625,7 +497,7 @@ function linux_check_static { if [ -f $dir/libatlas.a ]; then # candidate... # Note: on the next line, the variable assignment # LANG=en_US should apply just to the program called on that line. 
- if LANG=en_US gcc -o test_linking test_linking.cc -u ATL_flushcache $dir/libatlas.a 2>&1 | grep -i "incompatible" >/dev/null; then + if LANG=en_US $CXX -o test_linking test_linking.cc -u ATL_flushcache $dir/libatlas.a 2>&1 | grep -i "incompatible" >/dev/null; then echo "Directory $dir may contain ATLAS libraries but seems to be wrong architecture"; rm test_linking test_linking.cc 2>/dev/null return 1; @@ -651,9 +523,9 @@ function linux_configure_debian_ubuntu { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk echo >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then + if [[ "$TARGET_ARCH" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "`uname -m`" == ppc64le ]]; then + elif [[ "$TARGET_ARCH" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk @@ -675,9 +547,9 @@ function linux_configure_debian_ubuntu3 { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk echo >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then + if [[ "$TARGET_ARCH" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "`uname -m`" == ppc64le ]]; then + elif [[ "$TARGET_ARCH" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk @@ -701,9 +573,9 @@ function linux_configure_debian7 { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk echo >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then + if [[ "$TARGET_ARCH" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "`uname -m`" == ppc64le ]]; then + elif [[ "$TARGET_ARCH" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk @@ -724,9 +596,9 @@ function linux_configure_redhat { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk echo >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then + if [[ "$TARGET_ARCH" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "`uname -m`" == ppc64le ]]; then + elif [[ "$TARGET_ARCH" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk @@ -749,9 +621,9 @@ function linux_configure_redhat_fat { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk echo >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then + if [[ "$TARGET_ARCH" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "`uname -m`" == ppc64le ]]; then + elif [[ "$TARGET_ARCH" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk @@ -806,9 +678,9 @@ function linux_configure_static { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk echo >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then + if [[ "$TARGET_ARCH" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - elif [[ "`uname -m`" == ppc64le ]]; then + elif [[ "$TARGET_ARCH" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk @@ -890,9 +762,9 @@ function linux_configure_dynamic { echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk echo >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then + if [[ "$TARGET_ARCH" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk - 
elif [[ "`uname -m`" == ppc64le ]]; then + elif [[ "$TARGET_ARCH" == ppc64le ]]; then cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk @@ -902,6 +774,234 @@ function linux_configure_dynamic { echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" } +############################# CONFIGURATION ############################# + +# If configuration sets any of these variables, we will switch the external +# math library. Here we unset them so that we can check later. +unset MKLROOT +unset CLAPACKROOT +unset OPENBLASROOT +unset MKLLIBDIR + +# This variable identifies the type of system where built programs and +# libraries will run. It is set by the configure script when cross compiling. +unset HOST + +# These environment variables can be used to override the default toolchain. +CXX=${CXX:-g++} +AR=${AR:-ar} +AS=${AS:-as} +RANLIB=${RANLIB:-ranlib} + +# These environment variables can be used to provide additional flags to the +# compiler/linker. We want these flags to override the flags determined by the +# configure script, so we append them to the appropriate variables (CXXFLAGS, +# LDFLAGS and LDLIBS) after those variables are set by the configure script. +ENV_CXXFLAGS=$CXXFLAGS +ENV_LDFLAGS=$LDFLAGS +ENV_LDLIBS=$LDLIBS + +# Default configuration +double_precision=false +dynamic_kaldi=false +use_cuda=true +static_fst=false +static_math=false +threaded_atlas=false +mkl_threading=sequential +android=false + +MATHLIB='ATLAS' +ATLASROOT=`rel2abs ../tools/ATLAS/` +FSTROOT=`rel2abs ../tools/openfst` + +# Save the command line to include in kaldi.mk +cmd_line="$0 $@" + +while [ $# -gt 0 ]; +do + case "$1" in + --help) + usage; exit 0 ;; + --version) + echo $CONFIGURE_VERSION; exit 0 ;; + --static) + dynamic_kaldi=false; + static_math=true; + static_fst=true; + shift ;; + --shared) + dynamic_kaldi=true; + static_math=false; + static_fst=false; + shift ;; + --double-precision) + double_precision=true; + shift ;; + --double-precision=yes) + double_precision=true; + shift ;; + --double-precision=no) + double_precision=false; + shift ;; + --atlas-root=*) + ATLASROOT=`read_dirname $1`; + shift ;; + --threaded-atlas) + threaded_atlas=true; + shift ;; + --threaded-atlas=yes) + threaded_atlas=true; + shift ;; + --threaded-atlas=no) + threaded_atlas=false; + shift ;; + --threaded-math) + threaded_atlas=true; + mkl_threading=iomp + shift ;; + --threaded-math=yes) + threaded_atlas=true; + mkl_threading=iomp + shift ;; + --threaded-math=no) + threaded_atlas=false; + mkl_threading=sequential + shift ;; + --use-cuda) + use_cuda=true; + shift ;; + --use-cuda=yes) + use_cuda=true; + shift ;; + --use-cuda=no) + use_cuda=false; + shift ;; + --static-math) + static_math=true; + shift ;; + --static-math=yes) + static_math=true; + shift ;; + --static-math=no) + static_math=false; + shift ;; + --static-fst) + static_fst=true; + shift ;; + --static-fst=yes) + static_fst=true; + shift ;; + --static-fst=no) + static_fst=false; + shift ;; + --mkl-threading=sequential) + threaded_atlas=false; + mkl_threading=sequential; + shift ;; + --mkl-threading=*) + mkl_threading=`expr "X$1" : '[^=]*=\(.*\)'`; + threaded_atlas=true; + shift ;; + --fst-root=*) + FSTROOT=`read_dirname $1`; + shift ;; + --clapack-root=*) + CLAPACKROOT=`read_dirname $1`; + shift ;; + --openblas-root=*) + OPENBLASROOT=`read_dirname $1`; + shift ;; + --mkl-root=*) + MKLROOT=`read_dirname $1`; + shift ;; + --mkl-libdir=*) + MKLLIBDIR=`read_dirname $1`; + shift ;; + --speex-root=*) + 
SPEEXROOT=`read_dirname $1`; + shift ;; + --speex-libdir=*) + SPEEXLIBDIR=`read_dirname $1`; + shift ;; + --speex-incdir=*) + SPEEXINCDIR=`read_dirname $1`; + shift ;; + --omp-libdir=*) + OMPLIBDIR=`read_dirname $1`; + shift ;; + --mathlib=*) + MATHLIB=`expr "X$1" : '[^=]*=\(.*\)'`; + shift ;; + --cudatk-dir=*) + CUDATKDIR=`read_dirname $1`; + shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only + --fst-version=*) + OPENFST_VER=`expr "X$1" : '[^=]*=\(.*\)'`; + shift;; + --host=*) + # The type of system where built programs and libraries will run. + # It should be in the format cpu-vendor-os. If specified, this script + # will infer the target architecture from the specified host triple. + HOST=`expr "X$1" : '[^=]*=\(.*\)'`; + shift ;; + --android-incdir=*) + android=true; + threaded_math=false; + static_math=true; + static_fst=true; + dynamic_kaldi=false; + MATHLIB='OPENBLAS'; + ANDROIDINCDIR=`read_dirname $1`; + shift;; + *) echo "Unknown argument: $1, exiting"; usage; exit 1 ;; + esac +done + +# The idea here is that if you change the configuration options from using +# CUDA to not using it, or vice versa, we want to recompile all parts of the +# code that may use a GPU. Touching this file is a way to force this. +touch cudamatrix/cu-common.h 2>/dev/null + +if $android && [[ "$CXX" != *clang++* ]] ; then + failure "Android build requires clang++. Make sure you have clang++ installed + on your system and then override the default compiler by setting CXX, e.g. + CXX=clang++ ./configure" +fi + +# If HOST is set +# 1. We prepend it to CXX, AR, AS and RANLIB. +# 2. We parse the target architecture from the HOST triple. +# Otherwise we set the target architecture to the output of `uname -m`. +if is_set $HOST; then + CXX="$HOST-$CXX" + AR="$HOST-$AR" + AS="$HOST-$AS" + RANLIB="$HOST-$RANLIB" + + # The host triple will be something like "armv8-rpi3-linux-gnueabihf". We + # need the first field which is the target architecture for this build. The + # following command will take the host triple "armv8-rpi3-linux-gnueabihf" + # and return ["armv8", "rpi3", "linux", "gnueabihf"] in PARTS. + IFS='-' read -ra PARTS <<< "$HOST" + # The first field in the PARTS list is the target architecture. + TARGET_ARCH="$PARTS" + if [[ "$TARGET_ARCH" != arm* && "$TARGET_ARCH" != ppc64le && "$TARGET_ARCH" != x86* ]] ; then + # We currently only support building for x86[_64], arm*, and ppc64le. + # If TARGET_ARCH was read from the HOST variable, it must be one of these. + failure "$TARGET_ARCH is not a supported architecture. + Supported architectures: x86[_64], arm*, ppc64le." + fi +else + TARGET_ARCH="`uname -m`" +fi + +# If one of these variables is set, we switch the external math library. +is_set $MKLLIBDIR && echo "Configuring KALDI to use MKL" && export MATHLIB="MKL" +is_set $MKLROOT && echo "Configuring KALDI to use MKL"&& export MATHLIB="MKL" +is_set $CLAPACKROOT && echo "Configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" +is_set $OPENBLASROOT && echo "Configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" + echo "Configuring ..." # Back up the old kaldi.mk in case we modified it @@ -910,13 +1010,24 @@ if [ -f kaldi.mk ]; then cp kaldi.mk kaldi.mk.bak fi -echo "Checking compiler $CXX ..." 
-check_compiler $CXX - -printf "# This file was generated using the following command:\n# $cmd_line\n\n" > kaldi.mk +# Generate the new kaldi.mk file +echo "# This file was generated using the following command:" > kaldi.mk +echo "# $cmd_line" >> kaldi.mk +echo >> kaldi.mk echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk echo >> kaldi.mk +echo "# Toolchain configuration" >> kaldi.mk +echo >> kaldi.mk +echo "CXX = $CXX" >> kaldi.mk +echo "AR = $AR" >> kaldi.mk +echo "AS = $AS" >> kaldi.mk +echo "RANLIB = $RANLIB" >> kaldi.mk +echo >> kaldi.mk + +echo "Checking compiler $CXX ..." +check_compiler $CXX + echo "# Base configuration" >> kaldi.mk echo >> kaldi.mk if $dynamic_kaldi ; then @@ -934,13 +1045,13 @@ if [ ! -f $FSTROOT/include/fst/fst.h ]; then failure "Could not find file $FSTROOT/include/fst/fst.h: you may not have installed OpenFst. See ../tools/INSTALL" fi -OPENFST_VER=$(grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::') +OPENFST_VER=${OPENFST_VER:-$(grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::')} OPENFST_VER_NUM=$(echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") if [ $OPENFST_VER_NUM -lt 10600 ]; then failure "OpenFst-$OPENFST_VER is not supported. You need OpenFst >= 1.6.0.)" fi echo "OPENFSTINC = $FSTROOT/include" >> kaldi.mk -if $static_fst ; then +if $static_fst ; then OPENFSTLIBS="$FSTROOT/lib/libfst.a" else if [ "`uname`" == "Darwin" ]; then @@ -950,7 +1061,7 @@ else OPENFSTLIBS="$FSTROOT/lib/libfst.so" OPENFSTLDFLAGS="-Wl,-rpath=${FSTROOT}/lib" else - failure "Dynamic libraries not supported on this platform. + failure "Dynamic libraries are not supported on this platform. Run configure with --static --static-fst=no flag." fi fi @@ -959,16 +1070,43 @@ if [ ! -f "$OPENFSTLIBS" ]; then fi echo "OPENFSTLIBS = $OPENFSTLIBS" >> kaldi.mk echo "OPENFSTLDFLAGS = $OPENFSTLDFLAGS" >> kaldi.mk -echo "CXX = $CXX" >> kaldi.mk echo >> kaldi.mk -# Most of the OS-specific steps below will append to kaldi.mk +# OS-specific steps given below append to kaldi.mk echo "Doing OS specific configurations ..." -# Check for Darwin at first, because we later call uname -o (for Cygwin) -# which crashes on Darwin. Also the linear algebra libraries on Macs are -# used differently (through the Accelerate framework) than on Linux. -if [ "`uname`" == "Darwin" ]; then +if $android ; then + if [ -z $ANDROIDINCDIR ] ; then + failure "--android-incdir must be specified for android builds." + fi + + if ! is_set $HOST; then + failure "HOST must be specified for android builds." + fi + + OPENBLASROOT=`rel2abs "$OPENBLASROOT"` + if [ -z "$OPENBLASROOT" ]; then + failure "The location of OPENBLAS must be specified for android builds + using --openblas-root (and it must exist)" + fi + if [ ! -f $OPENBLASROOT/lib/libopenblas.a ]; then + failure "Expected to find the file $OPENBLASROOT/lib/libopenblas.a" + fi + echo "Using OpenBLAS as the linear algebra library." + + OPENBLASLIBS="$OPENBLASROOT/lib/libopenblas.a $OPENBLASROOT/lib/libclapack.a $OPENBLASROOT/lib/liblapack.a $OPENBLASROOT/lib/libblas.a $OPENBLASROOT/lib/libf2c.a" + echo "OPENBLASINC = $OPENBLASROOT/include" >> kaldi.mk + echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk + echo "ANDROIDINCDIR = $ANDROIDINCDIR" >> kaldi.mk + + cat makefiles/android_openblas.mk >> kaldi.mk + + echo "Successfully configured for Android with OpenBLAS from $OPENBLASROOT." + +elif [ "`uname`" == "Darwin" ]; then + # Check for Darwin first, because we later call uname -o (for Cygwin) + # which crashes on Darwin. 
+ echo "On Darwin: Checking for Accelerate framework ..." if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then failure "Need the Accelerate framework to compile on Darwin." @@ -1054,7 +1192,7 @@ elif [ "`uname`" == "Linux" ]; then fi elif [ "$MATHLIB" == "MKL" ]; then - if [ "`uname -m`" != "x86_64" ]; then + if [ "$TARGET_ARCH" != "x86_64" ]; then failure "MKL on Linux only supported for Intel(R) 64 architecture (x86_64). See makefiles/linux_64_mkl.mk to manually configure for other platforms." fi @@ -1118,7 +1256,7 @@ elif [ "`uname`" == "Linux" ]; then if [ ! -f makefiles/linux_clapack.mk ]; then failure "makefiles/linux_clapack.mk not found." fi - if [[ "`uname -m`" == arm* ]]; then + if [[ "$TARGET_ARCH" == arm* ]]; then cat makefiles/linux_clapack_arm.mk >> kaldi.mk else cat makefiles/linux_clapack.mk >> kaldi.mk @@ -1147,9 +1285,9 @@ elif [ "`uname`" == "Linux" ]; then echo "OPENBLASINC = $OPENBLASROOT/include" >> kaldi.mk echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk echo >> kaldi.mk - if [[ "`uname -m`" == arm* ]]; then + if [[ "$TARGET_ARCH" == arm* ]]; then cat makefiles/linux_openblas_arm.mk >> kaldi.mk - elif [[ "`uname -m`" == ppc64le ]]; then + elif [[ "$TARGET_ARCH" == ppc64le ]]; then cat makefiles/linux_openblas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_openblas.mk >> kaldi.mk @@ -1163,8 +1301,7 @@ elif [ "`uname`" == "Linux" ]; then fi else failure "Could not detect the platform or we have not yet worked out the - appropriate configuration for this platform. - Please contact the developers." + appropriate configuration for this platform. Please contact the developers." fi # Append the flags set by environment variables last so they can be used diff --git a/src/makefiles/android_openblas.mk b/src/makefiles/android_openblas.mk index f628c0400a1..c8f60f4fa4f 100644 --- a/src/makefiles/android_openblas.mk +++ b/src/makefiles/android_openblas.mk @@ -1,64 +1,42 @@ -ifndef FSTROOT -$(error FSTROOT not defined.) -endif +# OpenBLAS specific Android configuration +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif ifndef OPENFSTLIBS $(error OPENFSTLIBS not defined.) endif - +ifndef OPENBLASINC +$(error OPENBLASINC not defined.) +endif ifndef OPENBLASLIBS $(error OPENBLASLIBS not defined.) endif - -ifndef OPENBLASROOT -$(error OPENBLASROOT not defined.) -endif - ifndef ANDROIDINC $(error ANDROIDINC not defined.) endif - CXXFLAGS += -mhard-float -D_NDK_MATH_NO_SOFTFP=1 -Wall -I.. \ - -pthread -mfpu=neon -ftree-vectorize -mfloat-abi=hard \ - -DHAVE_OPENBLAS -DANDROID_BUILD -I $(OPENBLASROOT)/include \ - -I$(ANDROIDINC) \ - -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN \ - -Wno-sign-compare -Winit-self \ - -DHAVE_CXXABI_H \ - -DHAVE_CLAPACK \ - -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ - # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC +COMPILER = $(shell $(CXX) -v 2>&1) +ifneq ($(findstring clang,$(COMPILER)),clang) +$(error Android build does not support compiling with $(CXX). + Supported compilers: clang++) endif -LDFLAGS = -Wl,--no-warn-mismatch -pie -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -ldl -lm_hard - -CC = clang++ -CXX = clang++ -AR = ar -AS = as -RANLIB = ranlib - -# Add no-mismatched-tags flag to suppress the annoying clang warnings -# that are perfectly valid per spec. 
-COMPILER = $(shell $(CXX) -v 2>&1 ) -ifeq ($(findstring clang,$(COMPILER)),clang) - CXXFLAGS += -Wno-mismatched-tags - # Link with libstdc++ if we are building against OpenFst < 1.4 - ifneq ("$(OPENFST_GE_10400)","1") - CXXFLAGS += -stdlib=libstdc++ - LDFLAGS += -stdlib=libstdc++ - endif -endif +CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self -Wno-mismatched-tags \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -DANDROID_BUILD \ + -I$(OPENBLASINC) -I$(ANDROIDINC) -ftree-vectorize -mfloat-abi=hard \ + -mfpu=neon -mhard-float -D_NDK_MATH_NO_SOFTFP=1 -pthread \ + -g # -O0 -DKALDI_PARANOID -# We need to tell recent versions of g++ to allow vector conversions without -# an explicit cast provided the vectors are of the same size. -ifeq ($(findstring GCC,$(COMPILER)),GCC) - CXXFLAGS += -flax-vector-conversions -Wno-unused-local-typedefs +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC endif - +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -Wl,--no-warn-mismatch -pie +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm_hard -ldl diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index e5657818ce5..c58cd3a42da 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -26,7 +26,3 @@ LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g \ --enable-auto-import -L/usr/lib/lapack LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -lcyglapack-0 -lcygblas-0 \ -lm -lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/src/makefiles/darwin.mk b/src/makefiles/darwin.mk index 24fbdca890f..dffcc878083 100644 --- a/src/makefiles/darwin.mk +++ b/src/makefiles/darwin.mk @@ -25,10 +25,6 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -framework Accelerate -lm -lpthread -ldl -AR = ar -AS = as -RANLIB = ranlib - # Compiler specific flags COMPILER = $(shell $(CXX) -v 2>&1) ifeq ($(findstring clang,$(COMPILER)),clang) diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index 929461831df..b30c7ad5474 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -30,7 +30,3 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/src/makefiles/linux_atlas_arm.mk b/src/makefiles/linux_atlas_arm.mk index 9b9c42257fb..35e98da51d7 100644 --- a/src/makefiles/linux_atlas_arm.mk +++ b/src/makefiles/linux_atlas_arm.mk @@ -30,7 +30,3 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/src/makefiles/linux_atlas_ppc64le.mk b/src/makefiles/linux_atlas_ppc64le.mk index a0c22927f2e..a5962f7964b 100644 --- a/src/makefiles/linux_atlas_ppc64le.mk +++ b/src/makefiles/linux_atlas_ppc64le.mk @@ -31,7 +31,3 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index 95c58d0ec22..87e016aae5b 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -24,7 +24,3 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm 
-lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/src/makefiles/linux_clapack_arm.mk b/src/makefiles/linux_clapack_arm.mk index 2b15193046b..d21e640d3c1 100644 --- a/src/makefiles/linux_clapack_arm.mk +++ b/src/makefiles/linux_clapack_arm.mk @@ -24,7 +24,3 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index b7b74bff89a..d145c687438 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -30,7 +30,3 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/src/makefiles/linux_openblas_arm.mk b/src/makefiles/linux_openblas_arm.mk index 344879580aa..29a91752509 100644 --- a/src/makefiles/linux_openblas_arm.mk +++ b/src/makefiles/linux_openblas_arm.mk @@ -30,7 +30,3 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/src/makefiles/linux_openblas_ppc64le.mk b/src/makefiles/linux_openblas_ppc64le.mk index 9225f4922f0..6550d915c6c 100644 --- a/src/makefiles/linux_openblas_ppc64le.mk +++ b/src/makefiles/linux_openblas_ppc64le.mk @@ -31,7 +31,3 @@ endif LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index 595557a5ef4..50b4047def7 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -58,7 +58,3 @@ MKL_DYN_MUL = -L$(MKLLIB) -lmkl_solver_lp64 -Wl,--start-group -lmkl_intel_lp64 \ LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(MKLFLAGS) -lm -lpthread -ldl - -AR = ar -AS = as -RANLIB = ranlib diff --git a/windows/get_version.pl b/windows/get_version.pl index f66a3a23c25..2a54891516a 100755 --- a/windows/get_version.pl +++ b/windows/get_version.pl @@ -39,6 +39,6 @@ }; my $kaldi_ver=<$F>; chomp $kaldi_ver; -print $H "#define KALDI_VERSION \"${kaldi_ver}-win\"\n"; +print $H "KALDI_VERSION=${kaldi_ver}-win\n"; close($F); close($H); From e4474a89dba7c7bf65d55f97e8f406a31d13b927 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Thu, 9 Feb 2017 14:49:52 -0500 Subject: [PATCH 141/213] [scripts,egs] ivector compatibility checks; minor fixes in egs (#1395) --- egs/rm/s5/local/chain/run_tdnn_5g.sh | 2 +- egs/rm/s5/local/chain/run_tdnn_5n.sh | 5 +- egs/wsj/s5/steps/libs/common.py | 6 +++ egs/wsj/s5/steps/libs/nnet3/train/common.py | 47 ++++++++++++++----- .../steps/nnet2/check_ivectors_compatible.sh | 40 ++++++++++++++++ egs/wsj/s5/steps/nnet2/get_ivector_id.sh | 42 +++++++++++++++++ egs/wsj/s5/steps/nnet3/align.sh | 7 ++- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 1 + egs/wsj/s5/steps/nnet3/chain/train.py | 5 +- egs/wsj/s5/steps/nnet3/decode.sh | 7 ++- egs/wsj/s5/steps/nnet3/get_egs.sh | 1 + .../s5/steps/nnet3/get_egs_discriminative.sh | 1 + egs/wsj/s5/steps/nnet3/get_egs_targets.sh | 3 +- egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh | 6 +++ egs/wsj/s5/steps/nnet3/train_dnn.py | 4 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 4 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 7 ++- egs/wsj/s5/steps/nnet3/train_rnn.py 
| 4 +- .../s5/steps/online/nnet2/extract_ivectors.sh | 5 +- .../online/nnet2/extract_ivectors_online.sh | 5 ++ .../online/nnet2/train_ivector_extractor.sh | 5 ++ 21 files changed, 181 insertions(+), 26 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh create mode 100755 egs/wsj/s5/steps/nnet2/get_ivector_id.sh diff --git a/egs/rm/s5/local/chain/run_tdnn_5g.sh b/egs/rm/s5/local/chain/run_tdnn_5g.sh index f6fbe070763..088cb3ec778 100755 --- a/egs/rm/s5/local/chain/run_tdnn_5g.sh +++ b/egs/rm/s5/local/chain/run_tdnn_5g.sh @@ -120,7 +120,7 @@ if [ $stage -le 8 ]; then --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ --trainer.optimization.final-effective-lrate $final_effective_lrate \ --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs true \ + --cleanup.remove-egs $remove_egs \ --feat-dir data/train \ --tree-dir $treedir \ --lat-dir exp/tri3b_lats \ diff --git a/egs/rm/s5/local/chain/run_tdnn_5n.sh b/egs/rm/s5/local/chain/run_tdnn_5n.sh index 7fd7b82aa1d..7a08becd57f 100755 --- a/egs/rm/s5/local/chain/run_tdnn_5n.sh +++ b/egs/rm/s5/local/chain/run_tdnn_5n.sh @@ -25,7 +25,8 @@ num_jobs_final=4 minibatch_size=128 frames_per_eg=150 remove_egs=false - +#common_egs_dir=exp/chain/tdnn_5g/egs/ +common_egs_dir= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -121,7 +122,7 @@ if [ $stage -le 8 ]; then --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ --trainer.optimization.final-effective-lrate $final_effective_lrate \ --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs true \ + --cleanup.remove-egs $remove_egs \ --feat-dir data/train \ --tree-dir $treedir \ --lat-dir exp/tri3b_lats \ diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index 1e0608525ba..66a02062e9c 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -294,6 +294,12 @@ def get_ivector_dim(ivector_dir=None): ivector_dim = int(stdout_val) return ivector_dim +def get_ivector_extractor_id(ivector_dir=None): + if ivector_dir is None: + return None + [stdout_val, stderr_val] = run_kaldi_command( + "steps/nnet2/get_ivector_id.sh {dir}".format(dir=ivector_dir)) + return stdout_val.strip() def get_feat_dim(feat_dir): [stdout_val, stderr_val] = run_kaldi_command( diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 5e328ad1894..6d212bc5d49 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -55,7 +55,7 @@ def get_successful_models(num_models, log_file_pattern, for line_num in range(1, len(lines) + 1): # we search from the end as this would result in # lesser number of regex searches. Python regex is slow ! 
- mat_obj = parse_regex.search(lines[-1*line_num]) + mat_obj = parse_regex.search(lines[-1 * line_num]) if mat_obj is not None: this_objf = float(mat_obj.groups()[0]) break @@ -64,7 +64,7 @@ def get_successful_models(num_models, log_file_pattern, accepted_models = [] for i in range(num_models): if (objf[max_index] - objf[i]) <= difference_threshold: - accepted_models.append(i+1) + accepted_models.append(i + 1) if len(accepted_models) != num_models: logger.warn("Only {0}/{1} of the models have been accepted " @@ -72,7 +72,7 @@ def get_successful_models(num_models, log_file_pattern, len(accepted_models), num_models, log_file_pattern)) - return [accepted_models, max_index+1] + return [accepted_models, max_index + 1] def get_average_nnet_model(dir, iter, nnets_list, run_opts, @@ -141,7 +141,7 @@ def validate_chunk_width(chunk_width): or a comma-separated list of integers like '20,30,16'""" if not isinstance(chunk_width, str): return False - a = chunk_width.split(","); + a = chunk_width.split(",") assert len(a) != 0 # would be code error for elem in a: try: @@ -173,7 +173,7 @@ def validate_range_str(range_str): for r in ranges: # a range may be either e.g. '64', or '128-256' try: - c = [ int(x) for x in r.split(":") ] + c = [int(x) for x in r.split(":")] except: return False # c should be either e.g. [ 128 ], or [64,128]. @@ -188,7 +188,6 @@ def validate_range_str(range_str): return True - def validate_minibatch_size_str(minibatch_size_str): """Validate a minibatch-size string (returns bool). A minibatch-size string might either be an integer, like '256', @@ -240,7 +239,7 @@ def halve_range_str(range_str): halved_ranges = [] for r in ranges: # a range may be either e.g. '64', or '128:256' - c = [ str(max(1, int(x)/2)) for x in r.split(":") ] + c = [str(max(1, int(x)/2)) for x in r.split(":")] halved_ranges.append(":".join(c)) return ','.join(halved_ranges) @@ -269,7 +268,7 @@ def halve_minibatch_size_str(minibatch_size_str): def copy_egs_properties_to_exp_dir(egs_dir, dir): try: - for file in ['cmvn_opts', 'splice_opts', 'final.mat']: + for file in ['cmvn_opts', 'splice_opts', 'info/final.ie.id', 'final.mat']: file_name = '{dir}/{file}'.format(dir=egs_dir, file=file) if os.path.isfile(file_name): shutil.copy2(file_name, dir) @@ -302,12 +301,23 @@ def parse_generic_config_vars_file(var_file): raise Exception('Error while parsing the file {0}'.format(var_file)) -def verify_egs_dir(egs_dir, feat_dim, ivector_dim, +def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, left_context, right_context, left_context_initial=-1, right_context_final=-1): try: egs_feat_dim = int(open('{0}/info/feat_dim'.format( egs_dir)).readline()) + + egs_ivector_id = None + try: + egs_ivector_id = open('{0}/info/final.ie.id'.format( + egs_dir)).readline().strip() + except: + # it could actually happen that the file is not there + # for example in cases where the egs were dumped by + # an older version of the script + pass + egs_ivector_dim = int(open('{0}/info/ivector_dim'.format( egs_dir)).readline()) egs_left_context = int(open('{0}/info/left_context'.format( @@ -330,12 +340,26 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, "the current experiment and the provided " "egs directory") + if (((egs_ivector_id is None) and (ivector_extractor_id is not None)) or + ((egs_ivector_id is not None) and (ivector_extractor_id is None))): + logger.warning("The ivector ids are inconsistently used. 
It's your " + "responsibility to make sure the ivector extractor " + "has been used consistently") + elif (((egs_ivector_id is None) and (ivector_extractor_id is None))): + logger.warning("The ivector ids are not used. It's your " + "responsibility to make sure the ivector extractor " + "has been used consistently") + elif (ivector_extractor_id != egs_ivector_id): + raise Exception("The egs were generated using a different ivector " + "extractor. id1 = {0}, id2={1}".format( + ivector_extractor_id, egs_ivector_id)); + if (egs_left_context < left_context or egs_right_context < right_context): raise Exception('The egs have insufficient (l,r) context ({0},{1}) ' 'versus expected ({2},{3})'.format( - egs_left_context, egs_right_context, - left_context, right_context)) + egs_left_context, egs_right_context, + left_context, right_context)) # the condition on the initial/final context is an equality condition, # not an inequality condition, as there is no mechanism to 'correct' the @@ -565,6 +589,7 @@ def self_test(): assert validate_chunk_width('64') assert validate_chunk_width('64,25,128') + class CommonParser: """Parser for parsing common options related to nnet3 training. diff --git a/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh b/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh new file mode 100755 index 00000000000..40cc0d2c349 --- /dev/null +++ b/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2016, Johns Hopkins University (Yenda Trmal ) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section + +#echo >&2 "$0 $@" # Print the command line for logging +if [ $# != 2 ] ; then + echo >$2 "Usage: $0 " + echo >$2 " e.g.: $0 exp/nnet3/extractor exp/nnet3/ivectors_dev10h.pem" +fi + +dir_a=$1 +dir_b=$2 + +id_a=$(steps/nnet2/get_ivector_id.sh $dir_a) +ret_a=$? +id_b=$(steps/nnet2/get_ivector_id.sh $dir_b) +ret_b=$? + +if [ ! -z "$id_a" ] && [ ! -z "${id_b}" ] ; then + if [ "${id_a}" == "${id_b}" ]; then + exit 0 + else + echo >&2 "$0: ERROR: iVector id in ${id_a} and the iVector id in ${id_a} do not match" + echo >&2 "$0: ERROR: that means that the systems are not compatible." + exit 1 + fi +elif [ -z "$id_a" ] && [ -z "${id_b}" ] ; then + echo >&2 "$0: WARNING: The directories do not contain iVector ID." + echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping " + echo >&2 "$0: WARNING: the directories compatible" + exit 0 +else + echo >&2 "$0: WARNING: One of the directories do not contain iVector ID." + echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping " + echo >&2 "$0: WARNING: the directories compatible" + exit 0 +fi diff --git a/egs/wsj/s5/steps/nnet2/get_ivector_id.sh b/egs/wsj/s5/steps/nnet2/get_ivector_id.sh new file mode 100755 index 00000000000..d7be853349d --- /dev/null +++ b/egs/wsj/s5/steps/nnet2/get_ivector_id.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright (c) 2016, Johns Hopkins University (Yenda Trmal ) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +# End configuration section. + +#echo >&2 "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + + +if [ $# != 1 ]; then + echo >$2 "Usage: $0 " + echo >$2 " e.g.: $0 exp/nnet3/extractor" + exit 1 +fi + +ivecdir=$1 + +if [ -f $ivecdir/final.ie.id ] ; then + cat $ivecdir/final.ie.id +elif [ -f $ivecdir/final.ie ] ; then + # note the creation can fail in case the extractor directory + # is not read-only media or the user des not have access rights + # in that case we will just behave as if the id is not available + id=$(md5sum $ivecdir/final.ie | awk '{print $1}') + echo "$id" > $ivecdir/final.ie.id || exit 1 + cat $ivecdir/final.ie.id +else + exit 1 +fi + +exit 0 + + + diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh index 4c3b0987562..1ae5218aa85 100755 --- a/egs/wsj/s5/steps/nnet3/align.sh +++ b/egs/wsj/s5/steps/nnet3/align.sh @@ -62,8 +62,11 @@ else fi extra_files= -[ ! -z "$online_ivector_dir" ] && \ - extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$srcdir/final.ie.id $online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 0b1ddd1fbc7..4a61f8edaa7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -215,6 +215,7 @@ fi if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 8624dc947b9..19276817ea0 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -265,6 +265,7 @@ def train(args, run_opts, background_process_handler): num_jobs = common_lib.get_number_of_jobs(args.tree_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) + ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment @@ -357,7 +358,8 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( - common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + common_train_lib.verify_egs_dir(egs_dir, feat_dim, + ivector_dim, ivector_id, egs_left_context, egs_right_context, egs_left_context_initial, egs_right_context_final)) @@ -370,6 +372,7 @@ def train(args, run_opts, background_process_handler): # copy the properties of the egs to dir for # use during decoding + logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) if (args.stage <= -2): diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index b97e7f415d7..8aa86e92dcb 100755 --- 
a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -65,8 +65,11 @@ srcdir=`dirname $dir`; # Assume model directory one level up from decoding direc model=$srcdir/$iter.mdl -[ ! -z "$online_ivector_dir" ] && \ - extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$srcdir/final.ie.id $online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index c47522fec7a..d72a3d23fe5 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -188,6 +188,7 @@ fi if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh index 377c49fc5cb..f74b66b5fd2 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -207,6 +207,7 @@ if [ ! -z $online_ivector_dir ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim >$dir/info/ivector_dim + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" else ivector_opts="" diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index 4af10e2dde1..a2749b48fac 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -179,7 +179,8 @@ if [ -f $dir/trans.scp ]; then fi if [ ! -z "$online_ivector_dir" ]; then - ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1 echo $ivector_dim > $dir/info/ivector_dim ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh index 8fce9ae3831..8f3dac45315 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh +++ b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh @@ -141,6 +141,7 @@ if [ -z "$online_ivector_dir" ]; then ivector_dim=0 else ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/final.ie.id || exit 1 fi if [ ! -z "$configs_dir" ]; then @@ -213,6 +214,11 @@ fi [ -z $egs_dir ] && egs_dir=$dir/egs +if [ ! 
-z "$online_ivector_dir" ] ; then + steps/nnet2/check_ivectors_compatible.sh $online_ivector_dir $egs_dir/info || exit 1 +fi + + if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; exit 1; diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index c400442d429..f8a4cb6c861 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -165,6 +165,7 @@ def train(args, run_opts, background_process_handler): num_jobs = common_lib.get_number_of_jobs(args.ali_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) + ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment @@ -231,7 +232,8 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( - common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + common_train_lib.verify_egs_dir(egs_dir, feat_dim, + ivector_dim, ivector_id, left_context, right_context)) assert(str(args.frames_per_eg) == frames_per_eg_str) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 0264b409e46..e65a690101a 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -164,6 +164,7 @@ def train(args, run_opts, background_process_handler): # Set some variables. feat_dim = common_lib.get_feat_dim(args.feat_dir) ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) + ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) @@ -246,7 +247,8 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( - common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + common_train_lib.verify_egs_dir(egs_dir, feat_dim, + ivector_dim, ivector_id, left_context, right_context)) assert(str(args.frames_per_eg) == frames_per_eg_str) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 5a96d6020fa..272485b898a 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -221,6 +221,8 @@ def train(args, run_opts, background_process_handler): # Set some variables. 
feat_dim = common_lib.get_feat_dim(args.feat_dir) ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) + ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) + config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) @@ -310,11 +312,12 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( - common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + common_train_lib.verify_egs_dir(egs_dir, feat_dim, + ivector_dim, ivector_id, left_context, right_context)) if args.chunk_width != frames_per_eg_str: raise Exception("mismatch between --egs.chunk-width and the frames_per_eg " - "in the egs dir {0} vs {1}".(args.chunk_width, + "in the egs dir {0} vs {1}".format(args.chunk_width, frames_per_eg_str)) if (args.num_jobs_final > num_archives): diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 5824a77dbfe..6636513e03d 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -221,6 +221,7 @@ def train(args, run_opts, background_process_handler): num_jobs = common_lib.get_number_of_jobs(args.ali_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) + ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment @@ -295,7 +296,8 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] = ( - common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + common_train_lib.verify_egs_dir(egs_dir, feat_dim, + ivector_dim, ivector_id, left_context, right_context, left_context_initial, right_context_final)) if args.chunk_width != frames_per_eg_str: diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh index f27baecd673..53026b840bd 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh @@ -277,4 +277,7 @@ if [ $stage -le 5 ]; then for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1; fi -echo "$0: done extracting (pseudo-online) iVectors" +steps/nnet2/get_ivector_id.sh $srcdir > $dir/final.ie.id || exit 1 + +echo "$0: done extracting (pseudo-online) iVectors to $dir using the extractor in $srcdir." + diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh index b52de1f516b..f4d908e9446 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh @@ -127,3 +127,8 @@ if [ $stage -le 1 ]; then echo "$0: combining iVectors across jobs" for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1; fi + +steps/nnet2/get_ivector_id.sh $srcdir > $dir/final.ie.id || exit 1 + +echo "$0: done extracting (online) iVectors to $dir using the extractor in $srcdir." 
+ diff --git a/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh b/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh index 9b354c0753e..67845b01c8a 100755 --- a/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh +++ b/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh @@ -184,3 +184,8 @@ done rm $dir/final.ie 2>/dev/null ln -s $x.ie $dir/final.ie + +# assign a unique id to this extractor +# we are not interested in the id itself, just pre-caching ... +steps/nnet2/get_ivector_id.sh $dir > /dev/null || exit 1 + From f81bc7a36501235150ca8855bc1b445f50a4ce4a Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Thu, 9 Feb 2017 15:08:00 -0500 Subject: [PATCH 142/213] [build] make the Makefile checks serial (#1409) --- src/Makefile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index fded748fbe5..c3346d873ef 100644 --- a/src/Makefile +++ b/src/Makefile @@ -31,9 +31,15 @@ include kaldi.mk # Reset the default goal, so that the all target will become default .DEFAULT_GOAL := -all: checkversion kaldi.mk mklibdir $(SUBDIRS) +all: + $(MAKE) checkversion + $(MAKE) kaldi.mk + $(MAKE) mklibdir + $(MAKE) subdirs -echo Done +subdirs: $(SUBDIRS) + mklibdir: test -d $(KALDILIBDIR) || mkdir $(KALDILIBDIR) @@ -51,8 +57,10 @@ checkversion: ifeq ($(shell ./configure --version),$(CONFIGURE_VERSION)) @echo "The version of configure script matches kaldi.mk version. Good." else + @echo "" @echo "The kaldi.mk file was generated using a different version of configure script. Please rerun the configure again" @test -f ./kaldi.mk && echo "Hint: Previous configure command line: " && head -n 2 ./kaldi.mk | grep configure | sed 's/^# *//g' + @echo "" @false endif From a8cce104b3436d0e52f8da2cab234d26eab6ae9c Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Thu, 9 Feb 2017 12:08:50 -0800 Subject: [PATCH 143/213] Resolve merge conflicts and add "make ext" to travis build (#1407) * [build]: resolving OpenFst compilation issue with gcc-6.x (#1392) * [egs] Add new graphemic system for Gale Arabic, with newer nnet scripts (#1298) * [build] Windows build: generate missing base/version.h; cosmetic changes (#1397) * [build]: Enable cross compilation, including to android. (#726) If a user has a number of tool chains installed and they do not want to use the default, they must currently edit the kaldi.mk file after running configure to change the CC, CXX, AR, AS, and RANLIB variables. This is something that should be exposed via the configure script. This patch exposes an option to set the host triple for the desired tool chain in the configure script. Building Kaldi on my Raspberry Pi boards is not particularly fast. I have been using the following patch to build kaldi executables for use on the Pi boards for the better part of a year. A typical invocation for me is something like: $ ./configure --static --atlas-root=/opt/cross/armv8hf \ --fst-root=/opt/cross/armv8hf --host=armv8-rpi3-linux-gnueabihf \ --fst-version=1.4.1 This way I can build on my much faster x86 desktop, but still run experiments on ARM. I have included support for cross compiling for ppc64le and it works for me (at least it produces binaries for ppc64le I don't have a ppc64 machine to test it). Signed-off-by: Eric B Munson * Add mk file and configure options for building for Android Building for Android requires a toolchain that can be built using the Android NDK. 
It works similiarly to the linux build except that it only uses clang, only supports the openBLAS math library, and requires an additional include directory for the system C++ headers. A typical configure invocation looks like: ./configure --static --openblas-root=/opt/cross/arm-linux-androideabi \ --fst-root=/opt/cross/arm-linux-androideabi \ --host=arm-linux-androideabi --fst-version=1.4.1 \ --android-includes=/opt/cross/arm-linux-androideabi/sysroot/usr/include Signed-off-by: Eric B Munson * Make pthread cancel symbols noops for Android The Android C library does not support cancelling pthreads so the symbols PTHREAD_CANCEL_STATE and pthread_setcancelstate are undefined. Because a pthread cannot be cancelled in Android, it is reasonable to make the pthread_setcancelstate() call a noop. Signed-off-by: Eric B Munson * [build] fixing issue introduced in the previous win commit (#1399) * [egs] Fix to HKUST nnet2/3 scripts. (#1401) when training ubm, we should just use the 40 dimention mfcc so change the train directory for avoiding dimention mismatching this script won't get error when run after nnet2's scripts. * [egs,scripts,src] Add BABEL s5d recipe; various associated fixes (#1356) * Creating a new recipe directory * adding lists * Improvements in the pipeline, fixes, syllab search * Transplanting the diff to s5d * added TDNN, LSTM and BLSTM scripts. added Telugu conf files. * added blstm script and top level commands * improved keyword search, new lang configs * removing not needed scripts * added blstm results * some keyword-search optimization binaries * removing some extra files + kwsearch pipeline improvement * adding configs for the OP3 langs * configs for the rest of the OP3 langs * Added updated configs for IndusDB.20151208.Babel.tar.bz2 * fixes of the pipeline, added langp (re)estimation * adding the kaldi-native search pipeline and a bunch of changes related to this * removing extra files * A couple of fixes * KWS improvements and fixes * Fixes of a couple of issues reported by Fred Richardson * A separate script for lexicon expansion * A couple of fixes and tweaks. Added checks for tools, especially sox. * adding a couple of changes -- new style options and results for BP langs * adding new results(still will need to be updated) * added langp and some details tweaked * updated STT results, new KWS results and a couple of small fixes all around * adding file lists for dev languages * miniature fixes and cleanups * one more batch of small fixes -- mostly whitespace cleanup * small fixes -- location of files and removal of trailing slash inn the pathname * enabling stage-2 KWS pipeline * adding some directories to .gitignore * some quick fixes * latest fixes * making the script split_compound_set to conform to the naming * some last minute fixes for the combination scoring * do not attempt to score when the scoring data is not available * bug fixes and --ntrue-from option * another batch of fixes * adding +x permission to split_compound_set.sh * fixing whitespaces * fixing whitespaces * a couple of fixes * adding the cleanup script and chain models training * adding the graphemic/unicode lexicon feature * adding the graphemic/unicode lexicon feature * fixing the the cc files headers, adding c info * use the user-provided kwset id, not the filename * use _cleaned affix * fixes w.r.t. 
getting chain models independent on other systems * small fixes as reported by Fred Richardson and Yenda * another issue reported by Fred Richarson * fixing KWS for the chain systems * fixes in the KWS hitlist combination * adding 40hrs pashto config and fixes for the unicode system * fixing some bugs as reported by Ni Chongjia (I2R) * fixing some bugs as reported by Fred Richardson * adding 40hrs Pashto OP3 setup * addressing Dan's comments, some further cleanup * improving the make_index script * remove fsts-scale * adding 'see also' to some of the fst tools * adding back accidentaly removed svn check * [egs] removing empty files in BABEL recipe (#1406) These caused a problem on MacOS, as reported by @dogancan. * Add online extension to travis build. * Fix parallel online extension build. Randomly choose between single and double precision BaseFloats in travis build. * Remove parantheses that were unintentinally added to the travis script in the previous commit. --- src/Makefile | 2 +- src/nnet/nnet-various.h | 4 ++-- src/nnet3/nnet-example-utils.h | 2 +- tools/extras/travis_script.sh | 10 +++++++++- windows/get_version.pl | 6 +++--- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Makefile b/src/Makefile index c3346d873ef..52b23261b76 100644 --- a/src/Makefile +++ b/src/Makefile @@ -142,7 +142,7 @@ $(SUBDIRS) : mklibdir $(MAKE) -C $@ .PHONY: $(EXT_SUBDIRS) -$(EXT_SUBDIRS) : mklibdir +$(EXT_SUBDIRS) : mklibdir ext_depend $(MAKE) -C $@ diff --git a/src/nnet/nnet-various.h b/src/nnet/nnet-various.h index ddd370eaeff..eeef9bc25bf 100644 --- a/src/nnet/nnet-various.h +++ b/src/nnet/nnet-various.h @@ -389,7 +389,7 @@ class AddShift : public UpdatableComponent { shift_data_.AddVec(-lr * learn_rate_coef_, shift_data_grad_); } - void SetLearnRateCoef(float c) { learn_rate_coef_ = c; } + void SetLearnRateCoef(BaseFloat c) { learn_rate_coef_ = c; } protected: CuVector shift_data_; @@ -505,7 +505,7 @@ class Rescale : public UpdatableComponent { scale_data_.AddVec(-lr * learn_rate_coef_, scale_data_grad_); } - void SetLearnRateCoef(float c) { learn_rate_coef_ = c; } + void SetLearnRateCoef(BaseFloat c) { learn_rate_coef_ = c; } protected: CuVector scale_data_; diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index d1eb85b6d11..debd93599e9 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -162,7 +162,7 @@ struct ChunkTimeInfo { // frame appears in multiple chunks, we want to downweight it // so that the total weight remains 1. (Of course, the calling // code is free to ignore these weights if desired). - std::vector output_weights; + std::vector output_weights; }; diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index c8c6c2d7905..b3906450525 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -50,6 +50,13 @@ CF="\"$CFLAGS -g $(addsw -I $INCDIRS)\"" LDF="\"$LDFLAGS $(addsw -L $LIBDIRS)\"" CCC="$(mtoken CXX "$CXX")" +# Randomly choose between single and double precision +if [[ $(( RANDOM % 2 )) == 1 ]] ; then + DPF="--double-precision=yes" +else + DPF="--double-precision=no" +fi + echo "Building tools..." [Time: $(date)] runvx cd tools runvx make openfst "$CCC" CXXFLAGS="$CF" -j$MAXPAR @@ -57,8 +64,9 @@ cd .. echo "Building src..." 
[Time: $(date)] runvx cd src -runvx "$CCC" CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root="$XROOT/usr" +runvx "$CCC" CXXFLAGS="$CF" LDFLAGS="$LDF" ./configure --shared --use-cuda=no "$DPF" --mathlib=OPENBLAS --openblas-root="$XROOT/usr" runvx make all -j$MAXPAR +runvx make ext -j$MAXPAR echo "Running tests..." [Time: $(date)] runvx make test -k -j$MAXPAR diff --git a/windows/get_version.pl b/windows/get_version.pl index 2a54891516a..98d4a6b49e6 100755 --- a/windows/get_version.pl +++ b/windows/get_version.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl #=============================================================================== # Copyright 2017 (Author: Yenda Trmal ) -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -39,6 +39,6 @@ }; my $kaldi_ver=<$F>; chomp $kaldi_ver; -print $H "KALDI_VERSION=${kaldi_ver}-win\n"; -close($F); +print $H "#define KALDI_VERSION \"${kaldi_ver}-win\"\n"; +close($F); close($H); From 8acbbc91cca0a802ad00031774b84169601a4568 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 9 Feb 2017 18:07:19 -0500 Subject: [PATCH 144/213] [src,egs,scripts]: Replace online-nnet3 decoding setup with 'looped' decoding and give example script with TDNN+LSTM. --- .../local/chain/tuning/run_tdnn_lstm_1b.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1d.sh | 1 + .../local/chain/tuning/run_tdnn_lstm_1e.sh | 85 +++++- .../local/chain/tuning/run_tdnn_lstm_1f.sh | 3 +- .../local/chain/tuning/run_tdnn_lstm_1g.sh | 2 + egs/wsj/s5/steps/online/nnet3/decode.sh | 15 +- src/itf/decodable-itf.h | 2 +- src/nnet2/online-nnet2-decodable.cc | 12 +- src/nnet3/Makefile | 4 +- src/nnet3/decodable-online-looped.cc | 252 ++++++++++++++++++ src/nnet3/decodable-online-looped.h | 199 ++++++++++++++ src/nnet3/decodable-simple-looped.cc | 99 ++++--- src/nnet3/decodable-simple-looped.h | 36 ++- src/nnet3/nnet-am-decodable-simple.h | 2 +- src/nnet3/nnet-compile-looped.cc | 2 +- src/nnet3/online-nnet3-decodable-simple.cc | 221 --------------- src/nnet3/online-nnet3-decodable-simple.h | 153 ----------- src/online2/online-nnet2-decoding-threaded.cc | 54 ++-- src/online2/online-nnet2-feature-pipeline.cc | 6 - src/online2/online-nnet2-feature-pipeline.h | 46 ++-- src/online2/online-nnet3-decoding.cc | 35 +-- src/online2/online-nnet3-decoding.h | 64 ++--- .../online2-wav-nnet2-latgen-faster.cc | 81 +++--- .../online2-wav-nnet3-latgen-faster.cc | 110 ++++---- 24 files changed, 824 insertions(+), 662 deletions(-) create mode 100644 src/nnet3/decodable-online-looped.cc create mode 100644 src/nnet3/decodable-online-looped.h delete mode 100644 src/nnet3/online-nnet3-decodable-simple.cc delete mode 100644 src/nnet3/online-nnet3-decodable-simple.h diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh index 5149e5a54e8..eb2c91dc3d4 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -6,7 +6,7 @@ # and adding # --egs.chunk-left-context-initial=0 # and --egs.chunk-right-context-final=0 - +# See 1e for summary of results. 
# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi # exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh index 28ca16d939c..4be28a4ca97 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -2,6 +2,7 @@ # 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it # uses egs from 1b, remember to remove that before I commit. +# See 1e for summary of results. # steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi # exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh index 32950e7df6a..6704f9d299e 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -1,6 +1,50 @@ #!/bin/bash -# 1e is as 1b, but reducing decay-time from 40 to 20. +# 1e is as 1d, but reducing decay-time from 40 to 20. + +# The following table shows comparison of various decay-time values, +# namely: [b:unset=infinity, f:80, d:40, e:20, g:10, g2:5]. +# note: the g2 script is not checked in. +# There is no clear trend on the non-looped decoding, but looped decoding seems +# to improve as decay-time is decreased. We end up recommending decay-time=20, +# as by then we get all the improvement on looped decoding, and it's the +# most conservative setting with which we can get this improvement (although +# actually it seems fine to use an even smaller decay-time). + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{b,f,d,e,g,g2}_sp_bi + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1b_sp_bi exp/chain_cleaned/tdnn_lstm1f_sp_bi exp/chain_cleaned/tdnn_lstm1d_sp_bi exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1g_sp_bi exp/chain_cleaned/tdnn_lstm1g2_sp_bi +# System tdnn_lstm1b_sp_bi tdnn_lstm1f_sp_bi tdnn_lstm1d_sp_bi tdnn_lstm1e_sp_bi tdnn_lstm1g_sp_bi tdnn_lstm1g2_sp_bi +# WER on dev(orig) 9.1 8.8 9.0 9.0 9.0 9.4 +# [looped:] 9.4 9.3 9.2 9.0 8.9 9.4 +# WER on dev(rescored) 8.4 8.2 8.4 8.4 8.4 8.7 +# [looped:] 8.8 8.7 8.6 8.4 8.3 8.7 +# WER on test(orig) 8.9 9.0 8.9 8.8 8.8 9.3 +# [looped:] 9.3 9.3 9.0 8.8 8.8 9.2 +# WER on test(rescored) 8.4 8.6 8.3 8.4 8.4 8.9 +# [looped:] 8.7 8.9 8.5 8.3 8.4 8.8 +# Final train prob -0.0621 -0.0631 -0.0595 -0.0648 -0.0689 -0.0739 +# Final valid prob -0.0799 -0.0802 -0.0823 -0.0827 -0.0890 -0.0963 +# Final train prob (xent) -0.8300 -0.8295 -0.8129 -0.8372 -0.8610 -0.8792 +# Final valid prob (xent) -0.9500 -0.9662 -0.9589 -0.9497 -0.9982 -1.0256 + + +# the following table compares the 'online' decoding with regular and looped +# decoding. online decoding is a little better than either (possibly due to +# using slightly later iVectors). 
+# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi{,_online} 2>/dev/null +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1e_sp_bi_online +# System tdnn_lstm1e_sp_bi tdnn_lstm1e_sp_bi_online +# WER on dev(orig) 9.0 8.8 +# [looped:] 9.0 +# WER on dev(rescored) 8.4 8.4 +# [looped:] 8.4 +# WER on test(orig) 8.8 8.8 +# [looped:] 8.8 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.3 + # 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it # uses egs from 1b, remember to remove that before I commit. @@ -77,6 +121,8 @@ tdnn_lstm_affix=1e #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we common_egs_dir= # you can set this to use previously dumped egs. remove_egs=true +test_online_decoding=false # if true, it will run the last decoding stage. + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -289,8 +335,10 @@ if [ $stage -le 21 ]; then # 'looped' decoding. we didn't write a -parallel version of this program yet, # so it will take a bit longer as the --num-threads option is not supported. # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. rm $dir/.error 2>/dev/null || true for dset in dev test; do ( @@ -313,4 +361,35 @@ if [ $stage -le 21 ]; then fi +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh index ed778713907..3ed14f30956 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -1,6 +1,7 @@ #!/bin/bash -# 1f is as 1b, but increasing decay-time from 40 to 80. [see also 1e, at 20.] +# 1f is as 1d, but increasing decay-time from 40 to 80. [see also 1e, at 20.] +# see 1e for summary of results. # 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it # uses egs from 1b, remember to remove that before I commit. 
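The decay-time values compared in the 1b/1d/1e/1f/1g tables above are set on the fast-lstmp-layer lines of each tuning script's xconfig. A minimal sketch of the line being varied (the cell/projection dimensions and delay shown here are illustrative, not copied from these configs):

  # inside the network.xconfig heredoc of each run_tdnn_lstm_1* tuning script:
  fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 \
    non-recurrent-projection-dim=256 delay=-3 decay-time=20
  # decay-time is 40 in 1d, 20 in 1e, 80 in 1f, 10 in 1g, and unset (no decay) in 1b.

Leaving decay-time unset lets the LSTM state persist indefinitely, which may be why the looped-decoding numbers degrade for 1b relative to 1e and 1g in the tables above.
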
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh index bbc17c77aea..aff39a04025 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -2,6 +2,8 @@ ####################### # 1g is as 1e, but reducing decay-time further from 20 to 10. +# see 1e for summary of results. + # 1e is as 1b, but reducing decay-time from 40 to 20. # 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it diff --git a/egs/wsj/s5/steps/online/nnet3/decode.sh b/egs/wsj/s5/steps/online/nnet3/decode.sh index a4777f1edf7..118cf9e1260 100755 --- a/egs/wsj/s5/steps/online/nnet3/decode.sh +++ b/egs/wsj/s5/steps/online/nnet3/decode.sh @@ -8,6 +8,8 @@ stage=0 nj=4 cmd=run.pl +frames_per_chunk=20 +extra_left_context_initial=0 min_active=200 max_active=7000 beam=15.0 @@ -114,11 +116,6 @@ else fi -decoder=online2-wav-nnet3-latgen-faster -parallel_opts= -opts="--online=$online" - - if [ "$post_decode_acwt" == 1.0 ]; then lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" else @@ -132,8 +129,12 @@ if [ -f $srcdir/frame_subsampling_factor ]; then fi if [ $stage -le 0 ]; then - $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ - $decoder $opts $silence_weighting_opts --do-endpointing=$do_endpointing $frame_subsampling_opt \ + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + online2-wav-nnet3-latgen-faster $silence_weighting_opts --do-endpointing=$do_endpointing \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context-initial=$extra_left_context_initial \ + --online=$online \ + $frame_subsampling_opt \ --config=$online_config \ --min-active=$min_active --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ --acoustic-scale=$acwt --word-symbol-table=$graphdir/words.txt \ diff --git a/src/itf/decodable-itf.h b/src/itf/decodable-itf.h index ad3b7809dab..9852861969d 100644 --- a/src/itf/decodable-itf.h +++ b/src/itf/decodable-itf.h @@ -112,7 +112,7 @@ class DecodableInterface { /// Returns the number of states in the acoustic model /// (they will be indexed one-based, i.e. from 1 to NumIndices(); - /// this is for compatibility with OpenFst. + /// this is for compatibility with OpenFst). virtual int32 NumIndices() const = 0; virtual ~DecodableInterface() {} diff --git a/src/nnet2/online-nnet2-decodable.cc b/src/nnet2/online-nnet2-decodable.cc index 856326cf688..715e1cc280d 100644 --- a/src/nnet2/online-nnet2-decodable.cc +++ b/src/nnet2/online-nnet2-decodable.cc @@ -80,7 +80,7 @@ int32 DecodableNnet2Online::NumFramesReady() const { void DecodableNnet2Online::ComputeForFrame(int32 frame) { int32 features_ready = features_->NumFramesReady(); - bool input_finished = features_->IsLastFrame(features_ready - 1); + bool input_finished = features_->IsLastFrame(features_ready - 1); KALDI_ASSERT(frame >= 0); if (frame >= begin_frame_ && frame < begin_frame_ + scaled_loglikes_.NumRows()) @@ -112,20 +112,20 @@ void DecodableNnet2Online::ComputeForFrame(int32 frame) { t_modified = features_ready - 1; features_->GetFrame(t_modified, &row); } - CuMatrix cu_features; + CuMatrix cu_features; cu_features.Swap(&features); // Copy to GPU, if we're using one. - + int32 num_frames_out = input_frame_end - input_frame_begin - left_context_ - right_context_; - + CuMatrix cu_posteriors(num_frames_out, num_pdfs_); - + // The "false" below tells it not to pad the input: we've already done // any padding that we needed to do. 
NnetComputation(nnet_.GetNnet(), cu_features, false, &cu_posteriors); - + cu_posteriors.ApplyFloor(1.0e-20); // Avoid log of zero which leads to NaN. cu_posteriors.ApplyLog(); // subtract log-prior (divide by prior) diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index ef50f9960e1..76e0cbbdfbb 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -28,8 +28,8 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ discriminative-supervision.o nnet-discriminative-example.o \ nnet-discriminative-diagnostics.o \ discriminative-training.o nnet-discriminative-training.o \ - online-nnet3-decodable-simple.o nnet-compile-looped.o \ - decodable-simple-looped.o + nnet-compile-looped.o decodable-simple-looped.o \ + decodable-online-looped.o LIBNAME = kaldi-nnet3 diff --git a/src/nnet3/decodable-online-looped.cc b/src/nnet3/decodable-online-looped.cc new file mode 100644 index 00000000000..77be1f166bf --- /dev/null +++ b/src/nnet3/decodable-online-looped.cc @@ -0,0 +1,252 @@ +// nnet3/decodable-online-looped.cc + +// Copyright 2017 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include "nnet3/nnet-utils.h" + +namespace kaldi { +namespace nnet3 { + +DecodableNnetLoopedOnlineBase::DecodableNnetLoopedOnlineBase( + const DecodableNnetSimpleLoopedInfo &info, + OnlineFeatureInterface *input_features, + OnlineFeatureInterface *ivector_features): + num_chunks_computed_(0), + current_log_post_subsampled_offset_(-1), + info_(info), + input_features_(input_features), + ivector_features_(ivector_features), + computer_(info_.opts.compute_config, info_.computation, + info_.nnet, NULL) { // NULL is 'nnet_to_update' + // Check that feature dimensions match. + KALDI_ASSERT(input_features_ != NULL); + int32 nnet_input_dim = info_.nnet.InputDim("input"), + nnet_ivector_dim = info_.nnet.InputDim("ivector"), + feat_input_dim = input_features_->Dim(), + feat_ivector_dim = (ivector_features_ != NULL ? + ivector_features_->Dim() : -1); + if (nnet_input_dim != feat_input_dim) { + KALDI_ERR << "Input feature dimension mismatch: got " << feat_input_dim + << " but network expects " << nnet_input_dim; + } + if (nnet_ivector_dim != feat_ivector_dim) { + KALDI_ERR << "Ivector feature dimension mismatch: got " << feat_ivector_dim + << " but network expects " << nnet_ivector_dim; + } +} + + +int32 DecodableNnetLoopedOnlineBase::NumFramesReady() const { + // note: the ivector_features_ may have 2 or 3 fewer frames ready than + // input_features_, but we don't wait for them; we just use the most recent + // iVector we can. + int32 features_ready = input_features_->NumFramesReady(); + if (features_ready == 0) + return 0; + bool input_finished = input_features_->IsLastFrame(features_ready - 1); + + int32 sf = info_.opts.frame_subsampling_factor; + + if (input_finished) { + // if the input has finished,... 
we'll pad with duplicates of the last frame + // as needed to get the required right context. + return (features_ready + sf - 1) / sf; + } else { + // note: info_.right_context_ includes both the model context and any + // extra_right_context_ (but this + int32 non_subsampled_output_frames_ready = + std::max(0, features_ready - info_.frames_right_context); + int32 num_chunks_ready = non_subsampled_output_frames_ready / + info_.frames_per_chunk; + // note: the division by the frame subsampling factor 'sf' below + // doesn't need any attention to rounding because info_.frames_per_chunk + // is always a multiple of 'sf' (see 'frames_per_chunk = GetChunksize..." + // in decodable-simple-looped.cc). + return num_chunks_ready * info_.frames_per_chunk / sf; + } +} + + +// note: the frame-index argument is on the output of the network, i.e. after any +// subsampling, so we call it 'subsampled_frame'. +bool DecodableNnetLoopedOnlineBase::IsLastFrame( + int32 subsampled_frame) const { + // To understand this code, compare it with the code of NumFramesReady(), + // it follows the same structure. + int32 features_ready = input_features_->NumFramesReady(); + if (features_ready == 0) { + if (subsampled_frame == -1 && input_features_->IsLastFrame(-1)) { + // the attempt to handle this rather pathological case (input finished + // but no frames ready) is a little quixotic as we have not properly + // tested this and other parts of the code may die. + return true; + } else { + return false; + } + } + bool input_finished = input_features_->IsLastFrame(features_ready - 1); + if (!input_finished) + return false; + int32 sf = info_.opts.frame_subsampling_factor, + num_subsampled_frames_ready = (features_ready + sf - 1) / sf; + return (subsampled_frame == num_subsampled_frames_ready - 1); +} + + +void DecodableNnetLoopedOnlineBase::AdvanceChunk() { + // Prepare the input data for the next chunk of features. + // note: 'end' means one past the last. + int32 begin_input_frame, end_input_frame; + if (num_chunks_computed_ == 0) { + begin_input_frame = -info_.frames_left_context; + // note: end is last plus one. + end_input_frame = info_.frames_per_chunk + info_.frames_right_context; + } else { + // note: begin_input_frame will be the same as the previous end_input_frame. + // you can verify this directly if num_chunks_computed_ == 0, and then by + // induction. + begin_input_frame = num_chunks_computed_ * info_.frames_per_chunk + + info_.frames_right_context; + end_input_frame = begin_input_frame + info_.frames_per_chunk; + } + + int32 num_feature_frames_ready = input_features_->NumFramesReady(); + bool is_finished = input_features_->IsLastFrame(num_feature_frames_ready - 1); + + if (end_input_frame >= num_feature_frames_ready && !is_finished) { + // we shouldn't be attempting to read past the end of the available features + // until we have reached the end of the input (i.e. the end-user called + // InputFinished(), announcing that there is no more waveform; at this point + // we pad as needed with copies of the last frame, to flush out the last of + // the output. + // If the following error happens, it likely indicates a bug in this + // decodable code somewhere (although it could possibly indicate the + // user asking for a frame that was not ready, which would be a misuse + // of this class.. it can be figured out from gdb as in either case it + // would be a bug in the code. 
+ KALDI_ERR << "Attempt to access frame past the end of the available input"; + } + + + CuMatrix feats_chunk; + { // this block sets 'feats_chunk'. + Matrix this_feats(end_input_frame - begin_input_frame, + input_features_->Dim()); + for (int32 i = begin_input_frame; i < end_input_frame; i++) { + SubVector this_row(this_feats, i - begin_input_frame); + int32 input_frame = i; + if (input_frame < 0) input_frame = 0; + if (input_frame >= num_feature_frames_ready) + input_frame = num_feature_frames_ready - 1; + input_features_->GetFrame(input_frame, &this_row); + } + feats_chunk.Swap(&this_feats); + } + computer_.AcceptInput("input", &feats_chunk); + + if (info_.has_ivectors) { + KALDI_ASSERT(ivector_features_ != NULL); + KALDI_ASSERT(info_.request1.inputs.size() == 2); + // all but the 1st chunk should have 1 iVector, but there is no need to + // assume this. + int32 num_ivectors = (num_chunks_computed_ == 0 ? + info_.request1.inputs[1].indexes.size() : + info_.request2.inputs[1].indexes.size()); + KALDI_ASSERT(num_ivectors > 0); + + Vector ivector(ivector_features_->Dim()); + // we just get the iVector from the last input frame we needed, + // reduced as necessary + // we don't bother trying to be 'accurate' in getting the iVectors + // for their 'correct' frames, because in general using the + // iVector from as large 't' as possible will be better. + + int32 most_recent_input_frame = num_feature_frames_ready - 1, + num_ivector_frames_ready = ivector_features_->NumFramesReady(); + + if (num_ivector_frames_ready > 0) { + int32 ivector_frame_to_use = std::min( + most_recent_input_frame, num_ivector_frames_ready - 1); + ivector_features_->GetFrame(ivector_frame_to_use, + &ivector); + } + // else just leave the iVector zero (would only happen with very small + // chunk-size, like a chunk size of 2 which would be very inefficient; and + // only at file begin. + + // note: we expect num_ivectors to be 1 in practice. + Matrix ivectors(num_ivectors, + ivector.Dim()); + ivectors.CopyRowsFromVec(ivector); + CuMatrix cu_ivectors; + cu_ivectors.Swap(&ivectors); + computer_.AcceptInput("ivector", &cu_ivectors); + } + computer_.Run(); + + { + // Note: it's possible in theory that if you had weird recurrence that went + // directly from the output, the call to GetOutputDestructive() would cause + // a crash on the next chunk. If that happens, GetOutput() should be used + // instead of GetOutputDestructive(). But we don't anticipate this will + // happen in practice. 
+ CuMatrix output; + computer_.GetOutputDestructive("output", &output); + + if (info_.log_priors.Dim() != 0) { + // subtract log-prior (divide by prior) + output.AddVecToRows(-1.0, info_.log_priors); + } + // apply the acoustic scale + output.Scale(info_.opts.acoustic_scale); + current_log_post_.Resize(0, 0); + current_log_post_.Swap(&output); + } + KALDI_ASSERT(current_log_post_.NumRows() == info_.frames_per_chunk / + info_.opts.frame_subsampling_factor && + current_log_post_.NumCols() == info_.output_dim); + + num_chunks_computed_++; + + current_log_post_subsampled_offset_ = + (num_chunks_computed_ - 1) * + (info_.frames_per_chunk / info_.opts.frame_subsampling_factor); +} + +BaseFloat DecodableNnetLoopedOnline::LogLikelihood(int32 subsampled_frame, + int32 index) { + EnsureFrameIsComputed(subsampled_frame); + // note: we index by 'inde + return current_log_post_( + subsampled_frame - current_log_post_subsampled_offset_, + index - 1); +} + + +BaseFloat DecodableAmNnetLoopedOnline::LogLikelihood(int32 subsampled_frame, + int32 index) { + EnsureFrameIsComputed(subsampled_frame); + return current_log_post_( + subsampled_frame - current_log_post_subsampled_offset_, + trans_model_.TransitionIdToPdf(index)); +} + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/decodable-online-looped.h b/src/nnet3/decodable-online-looped.h new file mode 100644 index 00000000000..3041d3c4637 --- /dev/null +++ b/src/nnet3/decodable-online-looped.h @@ -0,0 +1,199 @@ +// nnet3/decodable-online-looped.h + +// Copyright 2014-2017 Johns Hopkins Universithy (author: Daniel Povey) +// 2016 Api.ai (Author: Ilya Platonov) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_DECODABLE_ONLINE_LOOPED_H_ +#define KALDI_NNET3_DECODABLE_ONLINE_LOOPED_H_ + +#include "itf/online-feature-itf.h" +#include "itf/decodable-itf.h" +#include "nnet3/am-nnet-simple.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/decodable-simple-looped.h" +#include "hmm/transition-model.h" + +namespace kaldi { +namespace nnet3 { + + +// The Decodable objects that we define in this header do the neural net +// computation in a way that's compatible with online feature extraction. It +// differs from the one declared in online-nnet3-decodable-simple.h because it +// uses the 'looped' network evaluation, which is more efficient because it +// re-uses hidden activations (and therefore doesn't have to pad chunks of data +// with extra left-context); it is applicable to TDNNs and to forwards-recurrent +// topologies like LSTMs, but not tobackwards-recurrent topologies such as +// BLSTMs. + +// The options are passed in the same way as in decodable-simple-looped.h, +// we use the same options and info class. + + +// This object is used as a base class for DecodableNnetLoopedOnline +// and DecodableAmNnetLoopedOnline. 
+// It takes care of the neural net computation and computations related to how +// many frames are ready (etc.), but it does not override the LogLikelihood() or +// NumIndices() functions so it is not usable as an object of type +// DecodableInterface. +class DecodableNnetLoopedOnlineBase: public DecodableInterface { + public: + // Constructor. 'input_feature' is for the feature that will be given + // as 'input' to the neural network; 'ivector_feature' is for the iVector + // feature, or NULL if iVectors are not being used. + DecodableNnetLoopedOnlineBase(const DecodableNnetSimpleLoopedInfo &info, + OnlineFeatureInterface *input_features, + OnlineFeatureInterface *ivector_features); + + // note: the LogLikelihood function is not overridden; the child + // class needs to do this. + //virtual BaseFloat LogLikelihood(int32 subsampled_frame, int32 index); + + // note: the frame argument is on the output of the network, i.e. after any + // subsampling, so we call it 'subsampled_frame'. + virtual bool IsLastFrame(int32 subsampled_frame) const; + + virtual int32 NumFramesReady() const; + + // Note: this function, present in the base-class, is overridden by the child class. + // virtual int32 NumIndices() const; + + // this is not part of the standard Decodable interface but I think is needed for + // something. + int32 FrameSubsamplingFactor() const { + return info_.opts.frame_subsampling_factor; + } + + + protected: + + /// If the neural-network outputs for this frame are not cached, this function + /// computes them (and possibly also some later frames). Note: + /// the frame-index is called 'subsampled_frame' because if frame-subsampling-factor + /// is not 1, it's an index that is "after subsampling", i.e. it changes more + /// slowly than the input-feature index. + inline void EnsureFrameIsComputed(int32 subsampled_frame) { + KALDI_ASSERT(subsampled_frame >= current_log_post_subsampled_offset_ && + "Frames must be accessed in order."); + while (subsampled_frame >= current_log_post_subsampled_offset_ + + current_log_post_.NumRows()) + AdvanceChunk(); + } + + // The current log-posteriors that we got from the last time we + // ran the computation. + Matrix current_log_post_; + + // The number of chunks we have computed so far. + int32 num_chunks_computed_; + + // The time-offset of the current log-posteriors, equals + // (num_chunks_computed_ - 1) * + // (info_.frames_per_chunk_ / info_.opts_.frame_subsampling_factor). + int32 current_log_post_subsampled_offset_; + + const DecodableNnetSimpleLoopedInfo &info_; + + private: + + // This function does the computation for the next chunk. It will change + // current_log_post_ and current_log_post_subsampled_offset_, and + // increment num_chunks_computed_. + void AdvanceChunk(); + + OnlineFeatureInterface *input_features_; + OnlineFeatureInterface *ivector_features_; + + NnetComputer computer_; + + KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableNnetLoopedOnlineBase); +}; + +// This decodable object takes indexes of the form (pdf_id + 1), +// or whatever the output-dimension of the neural network represents, +// plus one. +// It fully implements DecodableInterface. +// Note: whether or not division by the prior takes place depends on +// whether you supplied class AmNnetSimple (or just Nnet), to the constructor +// of the DecodableNnetSimpleLoopedInfo that you initailized this +// with. 
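For orientation, a rough usage sketch (not part of the patch itself): assuming an AmNnetSimple model 'am_nnet', its TransitionModel 'trans_model', and an OnlineNnet2FeaturePipeline 'feature_pipeline' have already been constructed elsewhere, the class declared just below, together with DecodableAmNnetLoopedOnline further down, could be wired up roughly as follows (variable names here are hypothetical):

    // Sketch only; 'am_nnet', 'trans_model' and 'feature_pipeline' are assumed
    // to have been loaded/constructed elsewhere (code is inside namespace kaldi).
    nnet3::NnetSimpleLoopedComputationOptions decodable_opts;
    decodable_opts.acoustic_scale = 1.0;  // e.g. for 'chain' models.
    // The info object compiles the 'looped' computation once and can be shared
    // across utterances; this constructor takes the priors from AmNnetSimple.
    nnet3::DecodableNnetSimpleLoopedInfo info(decodable_opts, &am_nnet);
    // The decodable reads features (and optionally iVectors) straight from the
    // online feature pipeline; the frame indexes it exposes are
    // post-subsampling ("subsampled frames").
    nnet3::DecodableAmNnetLoopedOnline decodable(
        trans_model, info,
        feature_pipeline.InputFeature(), feature_pipeline.IvectorFeature());
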
+class DecodableNnetLoopedOnline: public DecodableNnetLoopedOnlineBase { + public: + DecodableNnetLoopedOnline( + const DecodableNnetSimpleLoopedInfo &info, + OnlineFeatureInterface *input_features, + OnlineFeatureInterface *ivector_features): + DecodableNnetLoopedOnlineBase(info, input_features, ivector_features) { } + + + // returns the output-dim of the neural net. + virtual int32 NumIndices() const { return info_.output_dim; } + + // 'subsampled_frame' is a frame, but if frame-subsampling-factor != 1, it's a + // reduced-rate output frame (e.g. a 't' index divided by 3). 'index' + // represents the pdf-id (or other output of the network) PLUS ONE. + virtual BaseFloat LogLikelihood(int32 subsampled_frame, int32 index); + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableNnetLoopedOnline); + +}; + + +// This is for traditional decoding where the graph has transition-ids +// on the arcs, and you need the TransitionModel to map those to +// pdf-ids. +// Note: whether or not division by the prior takes place depends on +// whether you supplied class AmNnetSimple (or just Nnet), to the constructor +// of the DecodableNnetSimpleLoopedInfo that you initailized this +// with. +class DecodableAmNnetLoopedOnline: public DecodableNnetLoopedOnlineBase { + public: + DecodableAmNnetLoopedOnline( + const TransitionModel &trans_model, + const DecodableNnetSimpleLoopedInfo &info, + OnlineFeatureInterface *input_features, + OnlineFeatureInterface *ivector_features): + DecodableNnetLoopedOnlineBase(info, input_features, ivector_features), + trans_model_(trans_model) { } + + + // returns the output-dim of the neural net. + virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } + + // 'subsampled_frame' is a frame, but if frame-subsampling-factor != 1, it's a + // reduced-rate output frame (e.g. a 't' index divided by 3). 
+ virtual BaseFloat LogLikelihood(int32 subsampled_frame, + int32 transition_id); + + private: + const TransitionModel &trans_model_; + + KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmNnetLoopedOnline); + +}; + + + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_NNET3_DECODABLE_ONLINE_LOOPED_H_ diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index bb9a38632a1..df18d605b7d 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -28,7 +28,7 @@ namespace nnet3 { DecodableNnetSimpleLoopedInfo::DecodableNnetSimpleLoopedInfo( const NnetSimpleLoopedComputationOptions &opts, Nnet *nnet): - opts_(opts), nnet_(*nnet) { + opts(opts), nnet(*nnet) { Init(opts, nnet); } @@ -36,9 +36,9 @@ DecodableNnetSimpleLoopedInfo::DecodableNnetSimpleLoopedInfo( const NnetSimpleLoopedComputationOptions &opts, const Vector &priors, Nnet *nnet): - opts_(opts), nnet_(*nnet), log_priors_(priors) { - if (log_priors_.Dim() != 0) - log_priors_.ApplyLog(); + opts(opts), nnet(*nnet), log_priors(priors) { + if (log_priors.Dim() != 0) + log_priors.ApplyLog(); Init(opts, nnet); } @@ -46,9 +46,9 @@ DecodableNnetSimpleLoopedInfo::DecodableNnetSimpleLoopedInfo( DecodableNnetSimpleLoopedInfo::DecodableNnetSimpleLoopedInfo( const NnetSimpleLoopedComputationOptions &opts, AmNnetSimple *am_nnet): - opts_(opts), nnet_(am_nnet->GetNnet()), log_priors_(am_nnet->Priors()) { - if (log_priors_.Dim() != 0) - log_priors_.ApplyLog(); + opts(opts), nnet(am_nnet->GetNnet()), log_priors(am_nnet->Priors()) { + if (log_priors.Dim() != 0) + log_priors.ApplyLog(); Init(opts, &(am_nnet->GetNnet())); } @@ -58,35 +58,36 @@ void DecodableNnetSimpleLoopedInfo::Init( Nnet *nnet) { opts.Check(); KALDI_ASSERT(IsSimpleNnet(*nnet)); - has_ivectors_ = (nnet->InputDim("ivector") > 0); + has_ivectors = (nnet->InputDim("ivector") > 0); int32 left_context, right_context; ComputeSimpleNnetContext(*nnet, &left_context, &right_context); - frames_left_context_ = left_context + opts.extra_left_context_initial; - frames_right_context_ = right_context; - frames_per_chunk_ = GetChunkSize(*nnet, opts_.frame_subsampling_factor, - opts.frames_per_chunk); - output_dim_ = nnet->OutputDim("output"); - KALDI_ASSERT(output_dim_ > 0); + frames_left_context = left_context + opts.extra_left_context_initial; + frames_right_context = right_context; + frames_per_chunk = GetChunkSize(*nnet, opts.frame_subsampling_factor, + opts.frames_per_chunk); + output_dim = nnet->OutputDim("output"); + KALDI_ASSERT(output_dim > 0); // note, ivector_period is hardcoded to the same as frames_per_chunk_. - int32 ivector_period = frames_per_chunk_; - if (has_ivectors_) + int32 ivector_period = frames_per_chunk; + if (has_ivectors) ModifyNnetIvectorPeriod(ivector_period, nnet); int32 num_sequences = 1; // we're processing one utterance at a time. 
int32 extra_right_context = 0; - CreateLoopedComputationRequestSimple(*nnet, frames_per_chunk_, - opts_.frame_subsampling_factor, - ivector_period, opts.extra_left_context_initial, + CreateLoopedComputationRequestSimple(*nnet, frames_per_chunk, + opts.frame_subsampling_factor, + ivector_period, + opts.extra_left_context_initial, extra_right_context, num_sequences, - &request1_, &request2_, &request3_); + &request1, &request2, &request3); - CompileLooped(*nnet, opts_.optimize_config, request1_, request2_, request3_, - &computation_); - computation_.ComputeCudaIndexes(); + CompileLooped(*nnet, opts.optimize_config, request1, request2, request3, + &computation); + computation.ComputeCudaIndexes(); if (GetVerboseLevel() >= 3) { KALDI_VLOG(3) << "Computation is:"; - computation_.Print(std::cerr, *nnet); + computation.Print(std::cerr, *nnet); } } @@ -98,16 +99,16 @@ DecodableNnetSimpleLooped::DecodableNnetSimpleLooped( const MatrixBase *online_ivectors, int32 online_ivector_period): info_(info), - computer_(info_.opts_.compute_config, info_.computation_, - info_.nnet_, NULL), + computer_(info_.opts.compute_config, info_.computation, + info_.nnet, NULL), // NULL is 'nnet_to_update' feats_(feats), ivector_(ivector), online_ivector_feats_(online_ivectors), online_ivector_period_(online_ivector_period), num_chunks_computed_(0), current_log_post_subsampled_offset_(-1) { num_subsampled_frames_ = - (feats_.NumRows() + info_.opts_.frame_subsampling_factor - 1) / - info_.opts_.frame_subsampling_factor; + (feats_.NumRows() + info_.opts.frame_subsampling_factor - 1) / + info_.opts.frame_subsampling_factor; KALDI_ASSERT(!(ivector != NULL && online_ivectors != NULL)); KALDI_ASSERT(!(online_ivectors != NULL && online_ivector_period <= 0 && "You need to set the --online-ivector-period option!")); @@ -138,13 +139,13 @@ int32 DecodableNnetSimpleLooped::GetIvectorDim() const { void DecodableNnetSimpleLooped::AdvanceChunk() { int32 begin_input_frame, end_input_frame; if (num_chunks_computed_ == 0) { - begin_input_frame = -info_.frames_left_context_; + begin_input_frame = -info_.frames_left_context; // note: end is last plus one. - end_input_frame = info_.frames_per_chunk_ + info_.frames_right_context_; + end_input_frame = info_.frames_per_chunk + info_.frames_right_context; } else { - begin_input_frame = num_chunks_computed_ * info_.frames_per_chunk_ + - info_.frames_right_context_; - end_input_frame = begin_input_frame + info_.frames_per_chunk_; + begin_input_frame = num_chunks_computed_ * info_.frames_per_chunk + + info_.frames_right_context; + end_input_frame = begin_input_frame + info_.frames_per_chunk; } CuMatrix feats_chunk(end_input_frame - begin_input_frame, feats_.NumCols(), kUndefined); @@ -170,13 +171,13 @@ void DecodableNnetSimpleLooped::AdvanceChunk() { } computer_.AcceptInput("input", &feats_chunk); - if (info_.has_ivectors_) { - KALDI_ASSERT(info_.request1_.inputs.size() == 2); + if (info_.has_ivectors) { + KALDI_ASSERT(info_.request1.inputs.size() == 2); // all but the 1st chunk should have 1 iVector, but no need // to assume this. int32 num_ivectors = (num_chunks_computed_ == 0 ? - info_.request1_.inputs[1].indexes.size() : - info_.request2_.inputs[1].indexes.size()); + info_.request1.inputs[1].indexes.size() : + info_.request2.inputs[1].indexes.size()); KALDI_ASSERT(num_ivectors > 0); Vector ivector; @@ -194,40 +195,38 @@ void DecodableNnetSimpleLooped::AdvanceChunk() { computer_.Run(); { - // on GPU if we're using one, while avoiding unnecessary copies if we're not - // using the GPU. 
- // Note: it's possible in theory that if you had weird recurrence that went // directly from the output, the call to GetOutputDestructive() would cause - // a crash on the next chunk. But we don't anticipate this will happen in - // practice. + // a crash on the next chunk. If that happens, GetOutput() should be used + // instead of GetOutputDestructive(). But we don't anticipate this will + // happen in practice. CuMatrix output; computer_.GetOutputDestructive("output", &output); - if (info_.log_priors_.Dim() != 0) { + if (info_.log_priors.Dim() != 0) { // subtract log-prior (divide by prior) - output.AddVecToRows(-1.0, info_.log_priors_); + output.AddVecToRows(-1.0, info_.log_priors); } // apply the acoustic scale - output.Scale(info_.opts_.acoustic_scale); + output.Scale(info_.opts.acoustic_scale); current_log_post_.Resize(0, 0); current_log_post_.Swap(&output); } - KALDI_ASSERT(current_log_post_.NumRows() == info_.frames_per_chunk_ / - info_.opts_.frame_subsampling_factor && - current_log_post_.NumCols() == info_.output_dim_); + KALDI_ASSERT(current_log_post_.NumRows() == info_.frames_per_chunk / + info_.opts.frame_subsampling_factor && + current_log_post_.NumCols() == info_.output_dim); num_chunks_computed_++; current_log_post_subsampled_offset_ = (num_chunks_computed_ - 1) * - (info_.frames_per_chunk_ / info_.opts_.frame_subsampling_factor); + (info_.frames_per_chunk / info_.opts.frame_subsampling_factor); } void DecodableNnetSimpleLooped::GetCurrentIvector(int32 input_frame, Vector *ivector) { - if (!info_.has_ivectors_) + if (!info_.has_ivectors) return; if (ivector_ != NULL) { *ivector = *ivector_; diff --git a/src/nnet3/decodable-simple-looped.h b/src/nnet3/decodable-simple-looped.h index 5aba5b10505..ca3f732641e 100644 --- a/src/nnet3/decodable-simple-looped.h +++ b/src/nnet3/decodable-simple-looped.h @@ -51,7 +51,6 @@ struct NnetSimpleLoopedComputationOptions { bool debug_computation; NnetOptimizeOptions optimize_config; NnetComputeOptions compute_config; - NnetSimpleLoopedComputationOptions(): extra_left_context_initial(0), frame_subsampling_factor(1), @@ -95,9 +94,6 @@ struct NnetSimpleLoopedComputationOptions { } }; -// forward declaration. -class DecodableNnetSimpleLooped; - /** When you instantiate class DecodableNnetSimpleLooped, you should give it @@ -110,6 +106,8 @@ class DecodableNnetSimpleLoopedInfo { DecodableNnetSimpleLoopedInfo(const NnetSimpleLoopedComputationOptions &opts, Nnet *nnet); + // This constructor takes the priors from class AmNnetSimple (so it can divide by + // them). DecodableNnetSimpleLoopedInfo(const NnetSimpleLoopedComputationOptions &opts, AmNnetSimple *nnet); @@ -118,43 +116,41 @@ class DecodableNnetSimpleLoopedInfo { const Vector &priors, Nnet *nnet); - protected: void Init(const NnetSimpleLoopedComputationOptions &opts, Nnet *nnet); - friend class DecodableNnetSimpleLooped; - + const NnetSimpleLoopedComputationOptions &opts; - const NnetSimpleLoopedComputationOptions &opts_; - const Nnet &nnet_; + const Nnet &nnet; // the log priors (or the empty vector if the priors are not set in the model) - CuVector log_priors_; + CuVector log_priors; - // frames_left_context equals the model left context plus any extra left context. - int32 frames_left_context_; + // frames_left_context equals the model left context plus the value of the + // --extra-left-context-initial option. + int32 frames_left_context; // frames_right_context is the same as the right-context of the model. 
- int32 frames_right_context_; + int32 frames_right_context; // The frames_per_chunk_ equals the number of input frames we need for each // chunk (except for the first chunk). This divided by // opts_.frame_subsampling_factor gives the number of output frames. - int32 frames_per_chunk_; + int32 frames_per_chunk; // The output dimension of the neural network. - int32 output_dim_; + int32 output_dim; // True if the neural net accepts iVectors. If so, the neural net will have been modified // to accept the iVectors - bool has_ivectors_; + bool has_ivectors; // The 3 computation requests that are used to create the looped // computation are stored in the class, as we need them to work out // exactly shich iVectors are needed. - ComputationRequest request1_, request2_, request3_; - + ComputationRequest request1, request2, request3; + // The compiled, 'looped' computation. - NnetComputation computation_; + NnetComputation computation; }; /* @@ -197,7 +193,7 @@ class DecodableNnetSimpleLooped { // 1). inline int32 NumFrames() const { return num_subsampled_frames_; } - inline int32 OutputDim() const { return info_.output_dim_; } + inline int32 OutputDim() const { return info_.output_dim; } // Gets the output for a particular frame, with 0 <= frame < NumFrames(). // 'output' must be correctly sized (with dimension OutputDim()). Note: diff --git a/src/nnet3/nnet-am-decodable-simple.h b/src/nnet3/nnet-am-decodable-simple.h index 6b382fbe033..1895303d125 100644 --- a/src/nnet3/nnet-am-decodable-simple.h +++ b/src/nnet3/nnet-am-decodable-simple.h @@ -328,7 +328,7 @@ class DecodableAmNnetSimpleParallel: public DecodableInterface { CachingOptimizingCompiler-- because making that thread safe would be quite complicated, and in any case multi-threaded decoding probably makes the most sense when using CPU, and - in that case won't expect the compilation phase to dominate. + in that case we don't expect the compilation phase to dominate. This constructor takes features as input, and you can either supply a single iVector input, estimated in batch-mode ('ivector'), or 'online' diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index 1237ba6ce1e..70f88615ab9 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -321,7 +321,7 @@ void CompileLooped(const Nnet &nnet, request1, request2, request3, num_requests, computation)) { KALDI_LOG << "Spent " << timer.Elapsed() - << " seconds in looped nnet3 compilation."; + << " seconds in looped compilation."; return; } else { KALDI_VLOG(2) << "Looped compilation failed with " diff --git a/src/nnet3/online-nnet3-decodable-simple.cc b/src/nnet3/online-nnet3-decodable-simple.cc deleted file mode 100644 index 010dc80991a..00000000000 --- a/src/nnet3/online-nnet3-decodable-simple.cc +++ /dev/null @@ -1,221 +0,0 @@ -// nnet3/online-nnet3-decodable.cc - -// Copyright 2014 Johns Hopkins University (author: Daniel Povey) -// 2016 Api.ai (Author: Ilya Platonov) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include "nnet3/nnet-utils.h" - -namespace kaldi { -namespace nnet3 { - -DecodableNnet3SimpleOnline::DecodableNnet3SimpleOnline( - const AmNnetSimple &am_nnet, - const TransitionModel &trans_model, - const DecodableNnet3OnlineOptions &opts, - OnlineFeatureInterface *input_feats): - compiler_(am_nnet.GetNnet(), opts.optimize_config), - features_(input_feats), - am_nnet_(am_nnet), - trans_model_(trans_model), - opts_(opts), - feat_dim_(input_feats->Dim()), - num_pdfs_(am_nnet.GetNnet().OutputDim("output")), - begin_frame_(-1) { - KALDI_ASSERT(opts_.max_nnet_batch_size > 0); - log_priors_ = am_nnet_.Priors(); - KALDI_ASSERT((log_priors_.Dim() == 0 || log_priors_.Dim() == trans_model_.NumPdfs()) && - "Priors in neural network must match with transition model (if exist)."); - - ComputeSimpleNnetContext(am_nnet_.GetNnet(), &left_context_, &right_context_); - log_priors_.ApplyLog(); - - // Check that the dimensions are correct. - int32 input_dim = am_nnet_.GetNnet().InputDim("input"); - int32 ivector_dim = std::max(0, am_nnet_.GetNnet().InputDim("ivector")); - // We use feature extraction code that was designed for nnet2, which just - // appends the mfcc and ivector features. So here we have to separate them - // again. This code just checks that the dimension is as we expect. - int32 feature_dim = features_->Dim(); - if (feature_dim != input_dim + ivector_dim) { - KALDI_ERR << "Dimension of features " << feature_dim << " does not equal " - << "input dim " << input_dim << " + ivector dim " << ivector_dim - << " of neural network. Likely the config and neural net " - << "mismatch."; - } -} - - - -BaseFloat DecodableNnet3SimpleOnline::LogLikelihood(int32 frame, int32 index) { - ComputeForFrame(frame); - int32 pdf_id = trans_model_.TransitionIdToPdf(index); - KALDI_ASSERT(frame >= begin_frame_ && - frame < begin_frame_ + scaled_loglikes_.NumRows()); - return scaled_loglikes_(frame - begin_frame_, pdf_id); -} - - -bool DecodableNnet3SimpleOnline::IsLastFrame(int32 frame) const { - KALDI_ASSERT(false && "Method is not imlemented"); - return false; -} - -int32 DecodableNnet3SimpleOnline::NumFramesReady() const { - int32 features_ready = features_->NumFramesReady(); - if (features_ready == 0) - return 0; - bool input_finished = features_->IsLastFrame(features_ready - 1); - if (opts_.pad_input) { - // normal case... we'll pad with duplicates of first + last frame to get the - // required left and right context. 
- if (input_finished) return NumSubsampledFrames(features_ready); - else return std::max(0, NumSubsampledFrames(features_ready - right_context_)); - } else { - return std::max(0, NumSubsampledFrames(features_ready - right_context_ - left_context_)); - } -} - -int32 DecodableNnet3SimpleOnline::NumSubsampledFrames(int32 num_frames) const { - return (num_frames) / opts_.frame_subsampling_factor; -} - -void DecodableNnet3SimpleOnline::ComputeForFrame(int32 subsampled_frame) { - int32 features_ready = features_->NumFramesReady(); - bool input_finished = features_->IsLastFrame(features_ready - 1); - KALDI_ASSERT(subsampled_frame >= 0); - if (subsampled_frame >= begin_frame_ && - subsampled_frame < begin_frame_ + scaled_loglikes_.NumRows()) - return; - KALDI_ASSERT(subsampled_frame < NumFramesReady()); - - int32 subsample = opts_.frame_subsampling_factor; - - int32 input_frame_begin; - if (opts_.pad_input) - input_frame_begin = subsampled_frame * subsample - left_context_; - else - input_frame_begin = subsampled_frame * subsample; - int32 max_possible_input_frame_end = features_ready; - if (input_finished && opts_.pad_input) - max_possible_input_frame_end += right_context_; - int32 input_frame_end = std::min(max_possible_input_frame_end, - input_frame_begin + - left_context_ + right_context_ + - opts_.max_nnet_batch_size); - KALDI_ASSERT(input_frame_end > input_frame_begin); - Matrix features(input_frame_end - input_frame_begin, - feat_dim_); - for (int32 t = input_frame_begin; t < input_frame_end; t++) { - SubVector row(features, t - input_frame_begin); - int32 t_modified = t; - // The next two if-statements take care of "pad_input" - if (t_modified < 0) - t_modified = 0; - if (t_modified >= features_ready) - t_modified = features_ready - 1; - features_->GetFrame(t_modified, &row); - } - - int32 num_subsampled_frames = NumSubsampledFrames(input_frame_end - input_frame_begin - - left_context_ - right_context_); - int32 mfcc_dim = am_nnet_.GetNnet().InputDim("input"); - int32 ivector_dim = am_nnet_.GetNnet().InputDim("ivector"); - // MFCCs in the left chunk - SubMatrix mfcc_mat(features.ColRange(0, mfcc_dim)); - - Vector input_ivector; - if(ivector_dim != -1){ - // iVectors in the right chunk - KALDI_ASSERT(features.NumCols() == mfcc_dim + ivector_dim && "Mismatch in features dim"); - SubMatrix ivector_mat(features.ColRange(mfcc_dim, ivector_dim)); - // Get last ivector... not sure if GetCurrentIvector is needed in the online context - // I think it should work fine just getting the last row for testing - input_ivector = ivector_mat.Row(ivector_mat.NumRows() - 1); - } - - DoNnetComputation(input_frame_begin, - mfcc_mat, input_ivector, subsampled_frame * subsample, num_subsampled_frames); - - begin_frame_ = subsampled_frame; -} - -void DecodableNnet3SimpleOnline::DoNnetComputation( - int32 input_t_start, - const MatrixBase &input_feats, - const VectorBase &ivector, - int32 output_t_start, - int32 num_subsampled_frames) { - ComputationRequest request; - request.need_model_derivative = false; - request.store_component_stats = false; - - bool shift_time = true; // shift the 'input' and 'output' to a consistent - // time, to take advantage of caching in the compiler. - // An optimization. - int32 time_offset = (shift_time ? -output_t_start : 0); - - // First add the regular features-- named "input". 
- request.inputs.reserve(2); - request.inputs.push_back( - IoSpecification("input", time_offset + input_t_start, - time_offset + input_t_start + input_feats.NumRows())); - if (ivector.Dim() != 0) { - std::vector indexes; - indexes.push_back(Index(0, 0, 0)); - request.inputs.push_back(IoSpecification("ivector", indexes)); - } - IoSpecification output_spec; - output_spec.name = "output"; - output_spec.has_deriv = false; - int32 subsample = opts_.frame_subsampling_factor; - output_spec.indexes.resize(num_subsampled_frames); - // leave n and x values at 0 (the constructor sets these). - for (int32 i = 0; i < num_subsampled_frames; i++) - output_spec.indexes[i].t = time_offset + output_t_start + i * subsample; - request.outputs.resize(1); - request.outputs[0].Swap(&output_spec); - - const NnetComputation *computation = compiler_.Compile(request); - Nnet *nnet_to_update = NULL; // we're not doing any update. - NnetComputer computer(opts_.compute_config, *computation, - am_nnet_.GetNnet(), nnet_to_update); - - CuMatrix input_feats_cu(input_feats); - computer.AcceptInput("input", &input_feats_cu); - CuMatrix ivector_feats_cu; - if (ivector.Dim() > 0) { - ivector_feats_cu.Resize(1, ivector.Dim()); - ivector_feats_cu.Row(0).CopyFromVec(ivector); - computer.AcceptInput("ivector", &ivector_feats_cu); - } - computer.Run(); - CuMatrix cu_output; - computer.GetOutputDestructive("output", &cu_output); - // subtract log-prior (divide by prior) - if (log_priors_.Dim() != 0) - cu_output.AddVecToRows(-1.0, log_priors_); - // apply the acoustic scale - cu_output.Scale(opts_.acoustic_scale); - scaled_loglikes_.Resize(0, 0); - // the following statement just swaps the pointers if we're not using a GPU. - cu_output.Swap(&scaled_loglikes_); -} - -} // namespace nnet3 -} // namespace kaldi diff --git a/src/nnet3/online-nnet3-decodable-simple.h b/src/nnet3/online-nnet3-decodable-simple.h deleted file mode 100644 index af7c18da64b..00000000000 --- a/src/nnet3/online-nnet3-decodable-simple.h +++ /dev/null @@ -1,153 +0,0 @@ -// nnet3/online-nnet3-decodable-simple.h - -// Copyright 2014 Johns Hopkins Universithy (author: Daniel Povey) -// 2016 Api.ai (Author: Ilya Platonov) - - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_NNET3_ONLINE_NNET3_DECODABLE_H_ -#define KALDI_NNET3_ONLINE_NNET3_DECODABLE_H_ - -#include "itf/online-feature-itf.h" -#include "itf/decodable-itf.h" -#include "nnet3/am-nnet-simple.h" -#include "nnet3/nnet-compute.h" -#include "nnet3/nnet-optimize.h" -#include "hmm/transition-model.h" - -namespace kaldi { -namespace nnet3 { - -// Note: see also nnet-compute-online.h, which provides a different -// (lower-level) interface and more efficient for progressive evaluation of an -// nnet throughout an utterance, with re-use of already-computed activations. 
- -struct DecodableNnet3OnlineOptions { - int32 frame_subsampling_factor; - BaseFloat acoustic_scale; - bool pad_input; - int32 max_nnet_batch_size; - NnetComputeOptions compute_config; - NnetOptimizeOptions optimize_config; - - DecodableNnet3OnlineOptions(): - frame_subsampling_factor(1), - acoustic_scale(0.1), - pad_input(true), - max_nnet_batch_size(256) { } - - void Register(OptionsItf *opts) { - opts->Register("acoustic-scale", &acoustic_scale, - "Scaling factor for acoustic likelihoods"); - opts->Register("pad-input", &pad_input, - "If true, pad acoustic features with required acoustic context " - "past edges of file."); - opts->Register("max-nnet-batch-size", &max_nnet_batch_size, - "Maximum batch size we use in neural-network decodable object, " - "in cases where we are not constrained by currently available " - "frames (this will rarely make a difference)"); - - opts->Register("frame-subsampling-factor", &frame_subsampling_factor, - "Required if the frame-rate of the output (e.g. in 'chain' " - "models) is less than the frame-rate of the original " - "alignment."); - - // register the optimization options with the prefix "optimization". - ParseOptions optimization_opts("optimization", opts); - optimize_config.Register(&optimization_opts); - - // register the compute options with the prefix "computation". - ParseOptions compute_opts("computation", opts); - compute_config.Register(&compute_opts); - - } -}; - - -/** - This Decodable object for class nnet3::AmNnetSimple takes feature input from class - OnlineFeatureInterface, unlike, say, class DecodableAmNnet which takes - feature input from a matrix. -*/ - -class DecodableNnet3SimpleOnline: public DecodableInterface { - public: - DecodableNnet3SimpleOnline(const AmNnetSimple &am_nnet, - const TransitionModel &trans_model, - const DecodableNnet3OnlineOptions &opts, - OnlineFeatureInterface *input_feats); - - - /// Returns the scaled log likelihood - virtual BaseFloat LogLikelihood(int32 frame, int32 index); - - virtual bool IsLastFrame(int32 frame) const; - - virtual int32 NumFramesReady() const; - - /// Indices are one-based! This is for compatibility with OpenFst. - virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } - - int32 FrameSubsamplingFactor() const { return opts_.frame_subsampling_factor; } - private: - - /// If the neural-network outputs for this frame are not cached, it computes - /// them (and possibly for some succeeding frames) - void ComputeForFrame(int32 frame); - // corrects number of frames by frame_subsampling_factor; - int32 NumSubsampledFrames(int32) const; - - void DoNnetComputation( - int32 input_t_start, - const MatrixBase &input_feats, - const VectorBase &ivector, - int32 output_t_start, - int32 num_subsampled_frames); - - CachingOptimizingCompiler compiler_; - - OnlineFeatureInterface *features_; - const AmNnetSimple &am_nnet_; - const TransitionModel &trans_model_; - DecodableNnet3OnlineOptions opts_; - CuVector log_priors_; // log-priors taken from the model. - int32 feat_dim_; // dimensionality of the input features. - int32 left_context_; // Left context of the network (cached here) - int32 right_context_; // Right context of the network (cached here) - int32 num_pdfs_; // Number of pdfs, equals output-dim of the network (cached - // here) - - int32 begin_frame_; // First frame for which scaled_loglikes_ is valid - // (i.e. the first frame of the batch of frames for - // which we've computed the output). 
- - // scaled_loglikes_ contains the neural network pseudo-likelihoods: the log of - // (prob divided by the prior), scaled by opts.acoustic_scale). We may - // compute this using the GPU, but we transfer it back to the system memory - // when we store it here. These scores are only kept for a subset of frames, - // starting at begin_frame_, whose length depends how many frames were ready - // at the time we called LogLikelihood(), and will never exceed - // opts_.max_nnet_batch_size. - Matrix scaled_loglikes_; - - KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableNnet3SimpleOnline); -}; - -} // namespace nnet3 -} // namespace kaldi - -#endif // KALDI_NNET3_ONLINE_NNET3_DECODABLE_H_ diff --git a/src/online2/online-nnet2-decoding-threaded.cc b/src/online2/online-nnet2-decoding-threaded.cc index 09c9a4f6f0b..feb711df904 100644 --- a/src/online2/online-nnet2-decoding-threaded.cc +++ b/src/online2/online-nnet2-decoding-threaded.cc @@ -26,7 +26,7 @@ namespace kaldi { ThreadSynchronizer::ThreadSynchronizer(): - abort_(false), + abort_(false), producer_waiting_(false), consumer_waiting_(false), num_errors_(0) { @@ -67,8 +67,8 @@ bool ThreadSynchronizer::UnlockSuccess(ThreadType t) { producer_semaphore_.Signal(); producer_waiting_ = false; } - - } + + } mutex_.Unlock(); return !abort_; } @@ -192,7 +192,7 @@ void SingleUtteranceNnet2DecoderThreaded::AcceptWaveform( KALDI_ASSERT(sampling_rate == sampling_rate_); } num_samples_received_ += wave_part.Dim(); - + if (wave_part.Dim() == 0) return; if (!waveform_synchronizer_.Lock(ThreadSynchronizer::kProducer)) { KALDI_ERR << "Failure locking mutex: decoding aborted."; @@ -310,9 +310,9 @@ void SingleUtteranceNnet2DecoderThreaded::GetAdaptationState( OnlineIvectorExtractorAdaptationState *adaptation_state) { feature_pipeline_mutex_.Lock(); // If this blocks, it shouldn't be for very long. feature_pipeline_.GetAdaptationState(adaptation_state); - feature_pipeline_mutex_.Unlock(); // If this blocks, it won't be for very long. + feature_pipeline_mutex_.Unlock(); // If this blocks, it won't be for very long. } - + void SingleUtteranceNnet2DecoderThreaded::GetLattice( bool end_of_utterance, CompactLattice *clat, @@ -324,7 +324,7 @@ void SingleUtteranceNnet2DecoderThreaded::GetLattice( if (final_relative_cost != NULL) *final_relative_cost = decoder_.FinalRelativeCost(); if (decoder_.NumFramesDecoded() == 0) { - const_cast(decoder_mutex_).Unlock(); + const_cast(decoder_mutex_).Unlock(); clat->SetFinal(clat->AddState(), CompactLatticeWeight::One()); return; @@ -332,7 +332,7 @@ void SingleUtteranceNnet2DecoderThreaded::GetLattice( Lattice raw_lat; decoder_.GetRawLattice(&raw_lat, end_of_utterance); const_cast(decoder_mutex_).Unlock(); - + if (!config_.decoder_opts.determinize_lattice) KALDI_ERR << "--determinize-lattice=false option is not supported at the moment"; @@ -354,7 +354,7 @@ void SingleUtteranceNnet2DecoderThreaded::GetBestPath( best_path->DeleteStates(); best_path->SetFinal(best_path->AddState(), LatticeWeight::One()); - if (final_relative_cost != NULL) + if (final_relative_cost != NULL) *final_relative_cost = std::numeric_limits::infinity(); } else { decoder_.GetBestPath(best_path, @@ -447,7 +447,7 @@ void SingleUtteranceNnet2DecoderThreaded::ProcessLoglikes( // locked feature_pipeline_mutex_. 
bool SingleUtteranceNnet2DecoderThreaded::FeatureComputation( int32 num_frames_consumed) { - + int32 num_frames_ready = feature_pipeline_.NumFramesReady(), num_frames_usable = num_frames_ready - num_frames_consumed; bool features_done = feature_pipeline_.IsLastFrame(num_frames_ready - 1); @@ -457,7 +457,7 @@ bool SingleUtteranceNnet2DecoderThreaded::FeatureComputation( } else { if (num_frames_usable >= config_.nnet_batch_size) return true; // We don't need more data yet. - + // Now try to get more data, if we can. if (!waveform_synchronizer_.Lock(ThreadSynchronizer::kConsumer)) { return false; @@ -506,12 +506,12 @@ bool SingleUtteranceNnet2DecoderThreaded::FeatureComputation( bool SingleUtteranceNnet2DecoderThreaded::RunNnetEvaluationInternal() { // if any of the Lock/Unlock functions return false, it's because AbortAllThreads() // was called. - + // This object is responsible for keeping track of the context, and avoiding // re-computing things we've already computed. bool pad_input = true; nnet2::NnetOnlineComputer computer(am_nnet_.GetNnet(), pad_input); - + // we declare the following as CuVector just to enable GPU support, but // we expect this code to be run on CPU in the normal case. CuVector log_inv_prior(am_nnet_.Priors()); @@ -525,7 +525,7 @@ bool SingleUtteranceNnet2DecoderThreaded::RunNnetEvaluationInternal() { // has produced, which may be less than num_frames_consumed due to the // right-context of the network. int32 num_frames_consumed = 0, num_frames_output = 0; - + while (true) { bool last_time = false; @@ -536,19 +536,21 @@ bool SingleUtteranceNnet2DecoderThreaded::RunNnetEvaluationInternal() { return false; } // take care of silence weighting. - if (silence_weighting_.Active()) { + if (silence_weighting_.Active() && + feature_pipeline_.IvectorFeature() != NULL) { silence_weighting_mutex_.Lock(); std::vector > delta_weights; - silence_weighting_.GetDeltaWeights(feature_pipeline_.NumFramesReady(), - &delta_weights); + silence_weighting_.GetDeltaWeights( + feature_pipeline_.IvectorFeature()->NumFramesReady(), + &delta_weights); silence_weighting_mutex_.Unlock(); - feature_pipeline_.UpdateFrameWeights(delta_weights); + feature_pipeline_.IvectorFeature()->UpdateFrameWeights(delta_weights); } - + int32 num_frames_ready = feature_pipeline_.NumFramesReady(), num_frames_usable = num_frames_ready - num_frames_consumed; bool features_done = feature_pipeline_.IsLastFrame(num_frames_ready - 1); - + int32 num_frames_evaluate = std::min(num_frames_usable, config_.nnet_batch_size); @@ -563,10 +565,10 @@ bool SingleUtteranceNnet2DecoderThreaded::RunNnetEvaluationInternal() { } } /****** End locking of feature pipeline mutex. ******/ - feature_pipeline_mutex_.Unlock(); + feature_pipeline_mutex_.Unlock(); CuMatrix cu_loglikes; - + if (feats.NumRows() == 0) { if (features_done) { // flush out the last few frames. Note: this is the only place from @@ -587,7 +589,7 @@ bool SingleUtteranceNnet2DecoderThreaded::RunNnetEvaluationInternal() { num_frames_consumed += cu_feats.NumRows(); ProcessLoglikes(log_inv_prior, &cu_loglikes); } - + Matrix loglikes; loglikes.Swap(&cu_loglikes); // If we don't have a GPU (and not having a // GPU is the normal expected use-case for @@ -596,8 +598,8 @@ bool SingleUtteranceNnet2DecoderThreaded::RunNnetEvaluationInternal() { // OK, at this point we may have some newly created log-likes and we want to - // give them to the decoding thread. - + // give them to the decoding thread. 
+ int32 num_loglike_frames = loglikes.NumRows(); if (num_loglike_frames != 0) { // if we need to output some loglikes... @@ -644,7 +646,7 @@ bool SingleUtteranceNnet2DecoderThreaded::RunNnetEvaluationInternal() { } } } - + bool SingleUtteranceNnet2DecoderThreaded::RunDecoderSearchInternal() { int32 num_frames_decoded = 0; // this is just a copy of decoder_->NumFramesDecoded(); diff --git a/src/online2/online-nnet2-feature-pipeline.cc b/src/online2/online-nnet2-feature-pipeline.cc index fe79dbfd114..510c401fba2 100644 --- a/src/online2/online-nnet2-feature-pipeline.cc +++ b/src/online2/online-nnet2-feature-pipeline.cc @@ -168,12 +168,6 @@ void OnlineNnet2FeaturePipeline::AcceptWaveform( pitch_->AcceptWaveform(sampling_rate, waveform); } -void OnlineNnet2FeaturePipeline::UpdateFrameWeights( - const std::vector > &delta_weights) { - if (ivector_feature_ != NULL) - ivector_feature_->UpdateFrameWeights(delta_weights); -} - void OnlineNnet2FeaturePipeline::InputFinished() { base_feature_->InputFinished(); if (pitch_) diff --git a/src/online2/online-nnet2-feature-pipeline.h b/src/online2/online-nnet2-feature-pipeline.h index 77746bbd634..d8f933a090d 100644 --- a/src/online2/online-nnet2-feature-pipeline.h +++ b/src/online2/online-nnet2-feature-pipeline.h @@ -52,6 +52,9 @@ namespace kaldi { /// /// Most of the logic for the actual iVector estimation is in \ref /// online-ivector-feature.h, this header contains mostly glue. +/// +/// Although the name of this header mentions nnet2, actually the code is +/// used in the online decoding with nnet3 also. /// This configuration class is to set up OnlineNnet2FeaturePipelineInfo, which @@ -74,7 +77,7 @@ struct OnlineNnet2FeaturePipelineConfig { // the following contains the type of options that you could give to // compute-and-process-kaldi-pitch-feats. std::string online_pitch_config; - + // The configuration variables in ivector_extraction_config relate to the // iVector extractor and options related to it, see type // OnlineIvectorExtractionConfig. @@ -87,7 +90,7 @@ struct OnlineNnet2FeaturePipelineConfig { OnlineNnet2FeaturePipelineConfig(): feature_type("mfcc"), add_pitch(false) { } - + void Register(OptionsItf *opts) { opts->Register("feature-type", &feature_type, @@ -125,11 +128,11 @@ struct OnlineNnet2FeaturePipelineInfo { OnlineNnet2FeaturePipelineInfo( const OnlineNnet2FeaturePipelineConfig &config); - + BaseFloat FrameShiftInSeconds() const; std::string feature_type; // "mfcc" or "plp" or "fbank" - + MfccOptions mfcc_opts; // options for MFCC computation, // if feature_type == "mfcc" PlpOptions plp_opts; // Options for PLP computation, if feature_type == "plp" @@ -153,7 +156,7 @@ struct OnlineNnet2FeaturePipelineInfo { // it's the kind of thing you might want to play with directly // on the command line instead of inside sub-config-files. OnlineSilenceWeightingConfig silence_weighting_config; - + int32 IvectorDim() { return ivector_extractor_info.extractor.IvectorDim(); } private: KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineNnet2FeaturePipelineInfo); @@ -198,7 +201,7 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface { /// Copy(). 
void SetAdaptationState( const OnlineIvectorExtractorAdaptationState &adaptation_state); - + /// Get the adaptation state; you may want to call this before destroying this /// object, to get adaptation state that can be used to improve decoding of @@ -208,7 +211,7 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface { void GetAdaptationState( OnlineIvectorExtractorAdaptationState *adaptation_state) const; - + /// Accept more data to process. It won't actually process it until you call /// GetFrame() [probably indirectly via (decoder).AdvanceDecoding()], when you /// call this function it will just copy it). sampling_rate is necessary just @@ -216,12 +219,6 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface { void AcceptWaveform(BaseFloat sampling_rate, const VectorBase &waveform); - /// This is used in case you are downweighting silence in the iVector - /// estimation using the decoder traceback. - void UpdateFrameWeights( - const std::vector > &delta_weights); - - BaseFloat FrameShiftInSeconds() const { return info_.FrameShiftInSeconds(); } /// If you call InputFinished(), it tells the class you won't be providing any @@ -231,13 +228,28 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface { /// rescoring the lattices, this may not be much of an issue. void InputFinished(); + // This function returns the ivector-extracting part of the feature pipeline + // (or NULL if iVectors are not being used); the pointer is owned here and not + // given to the caller. This function is used in nnet3, and also in the + // silence-weighting code used to exclude silence from the iVector estimation. + OnlineIvectorFeature *IvectorFeature() { + return ivector_feature_; + } + + // This function returns the part of the feature pipeline that would be given + // as the primary (non-iVector) input to the neural network in nnet3 + // applications. + OnlineFeatureInterface *InputFeature() { + return feature_plus_optional_pitch_; + } + virtual ~OnlineNnet2FeaturePipeline(); private: const OnlineNnet2FeaturePipelineInfo &info_; OnlineBaseFeature *base_feature_; // MFCC/PLP/filterbank - + OnlinePitchFeature *pitch_; // Raw pitch, if used OnlineProcessPitch *pitch_feature_; // Processed pitch, if pitch used. @@ -245,15 +257,15 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface { // feature_plus_pitch_ is the base_feature_ appended (OnlineAppendFeature) /// with pitch_feature_, if used; otherwise, points to the same address as /// base_feature_. - OnlineFeatureInterface *feature_plus_optional_pitch_; - + OnlineFeatureInterface *feature_plus_optional_pitch_; + OnlineIvectorFeature *ivector_feature_; // iVector feature, if used. // final_feature_ is feature_plus_optional_pitch_ appended // (OnlineAppendFeature) with ivector_feature_, if ivector_feature_ is used; // otherwise, points to the same address as feature_plus_optional_pitch_. OnlineFeatureInterface *final_feature_; - + // we cache the feature dimension, to save time when calling Dim(). 
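A rough end-to-end sketch (again, not part of the patch): with the revised constructor defined in the online-nnet3-decoding.cc hunk above and declared in the header changes below, decoding a single utterance might look roughly like this; 'decoder_opts', 'trans_model', 'info', 'decode_fst', 'feature_pipeline', 'samp_freq' and 'wave_chunk' are all assumed to have been set up elsewhere:

    // Sketch only; all names below are assumed to be constructed elsewhere:
    // decoder_opts is a LatticeFasterDecoderConfig, info is a
    // nnet3::DecodableNnetSimpleLoopedInfo, decode_fst an fst::Fst<fst::StdArc>,
    // and feature_pipeline an OnlineNnet2FeaturePipeline.
    SingleUtteranceNnet3Decoder decoder(decoder_opts, trans_model, info,
                                        *decode_fst, &feature_pipeline);
    feature_pipeline.AcceptWaveform(samp_freq, wave_chunk);
    // ... further AcceptWaveform() calls as audio arrives, then:
    feature_pipeline.InputFinished();
    decoder.AdvanceDecoding();
    decoder.FinalizeDecoding();
    CompactLattice clat;
    decoder.GetLattice(true /* end_of_utterance */, &clat);
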
int32 dim_; }; diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc index 8dd366166c0..ff74c07f10c 100644 --- a/src/online2/online-nnet3-decoding.cc +++ b/src/online2/online-nnet3-decoding.cc @@ -25,16 +25,17 @@ namespace kaldi { SingleUtteranceNnet3Decoder::SingleUtteranceNnet3Decoder( - const OnlineNnet3DecodingConfig &config, - const TransitionModel &tmodel, - const nnet3::AmNnetSimple &am_model, + const LatticeFasterDecoderConfig &decoder_opts, + const TransitionModel &trans_model, + const nnet3::DecodableNnetSimpleLoopedInfo &info, const fst::Fst &fst, - OnlineFeatureInterface *feature_pipeline): - config_(config), - feature_pipeline_(feature_pipeline), - tmodel_(tmodel), - decodable_(am_model, tmodel, config.decodable_opts, feature_pipeline), - decoder_(fst, config.decoder_opts) { + OnlineNnet2FeaturePipeline *features): + decoder_opts_(decoder_opts), + input_feature_frame_shift_in_seconds_(features->FrameShiftInSeconds()), + trans_model_(trans_model), + decodable_(trans_model_, info, + features->InputFeature(), features->IvectorFeature()), + decoder_(fst, decoder_opts_) { decoder_.InitDecoding(); } @@ -57,12 +58,12 @@ void SingleUtteranceNnet3Decoder::GetLattice(bool end_of_utterance, Lattice raw_lat; decoder_.GetRawLattice(&raw_lat, end_of_utterance); - if (!config_.decoder_opts.determinize_lattice) + if (!decoder_opts_.determinize_lattice) KALDI_ERR << "--determinize-lattice=false option is not supported at the moment"; - BaseFloat lat_beam = config_.decoder_opts.lattice_beam; + BaseFloat lat_beam = decoder_opts_.lattice_beam; DeterminizeLatticePhonePrunedWrapper( - tmodel_, &raw_lat, lat_beam, clat, config_.decoder_opts.det_opts); + trans_model_, &raw_lat, lat_beam, clat, decoder_opts_.det_opts); } void SingleUtteranceNnet3Decoder::GetBestPath(bool end_of_utterance, @@ -72,12 +73,12 @@ void SingleUtteranceNnet3Decoder::GetBestPath(bool end_of_utterance, bool SingleUtteranceNnet3Decoder::EndpointDetected( const OnlineEndpointConfig &config) { - int32 subsample = decodable_.FrameSubsamplingFactor(); - return kaldi::EndpointDetected(config, tmodel_, - feature_pipeline_->FrameShiftInSeconds() * subsample, - decoder_); + BaseFloat output_frame_shift = + input_feature_frame_shift_in_seconds_ * + decodable_.FrameSubsamplingFactor(); + return kaldi::EndpointDetected(config, trans_model_, + output_frame_shift, decoder_); } } // namespace kaldi - diff --git a/src/online2/online-nnet3-decoding.h b/src/online2/online-nnet3-decoding.h index 788c713080b..1888b71dbf1 100644 --- a/src/online2/online-nnet3-decoding.h +++ b/src/online2/online-nnet3-decoding.h @@ -26,12 +26,13 @@ #include #include -#include "nnet3/online-nnet3-decodable-simple.h" +#include "nnet3/decodable-online-looped.h" #include "matrix/matrix-lib.h" #include "util/common-utils.h" #include "base/kaldi-error.h" #include "itf/online-feature-itf.h" #include "online2/online-endpoint.h" +#include "online2/online-nnet2-feature-pipeline.h" #include "decoder/lattice-faster-online-decoder.h" #include "hmm/transition-model.h" #include "hmm/posterior.h" @@ -41,40 +42,21 @@ namespace kaldi { /// @{ - - - -// This configuration class contains the configuration classes needed to create -// the class SingleUtteranceNnet3Decoder. The actual command line program -// requires other configs that it creates separately, and which are not included -// here: namely, OnlineNnet2FeaturePipelineConfig and OnlineEndpointConfig. 
-struct OnlineNnet3DecodingConfig { - - LatticeFasterDecoderConfig decoder_opts; - nnet3::DecodableNnet3OnlineOptions decodable_opts; - - OnlineNnet3DecodingConfig() { decodable_opts.acoustic_scale = 0.1; } - - void Register(OptionsItf *opts) { - decoder_opts.Register(opts); - decodable_opts.Register(opts); - } -}; - /** You will instantiate this class when you want to decode a single utterance using the online-decoding setup for neural nets. */ class SingleUtteranceNnet3Decoder { public: - // Constructor. The feature_pipeline_ pointer is not owned in this - // class, it's owned externally. - SingleUtteranceNnet3Decoder(const OnlineNnet3DecodingConfig &config, - const TransitionModel &tmodel, - const nnet3::AmNnetSimple &am_model, + + // Constructor. The pointer 'features' is not being given to this class to own + // and deallocate, it is owned externally. + SingleUtteranceNnet3Decoder(const LatticeFasterDecoderConfig &decoder_opts, + const TransitionModel &trans_model, + const nnet3::DecodableNnetSimpleLoopedInfo &info, const fst::Fst &fst, - OnlineFeatureInterface *feature_pipeline); - + OnlineNnet2FeaturePipeline *features); + /// advance the decoding as far as we can. void AdvanceDecoding(); @@ -84,7 +66,7 @@ class SingleUtteranceNnet3Decoder { void FinalizeDecoding(); int32 NumFramesDecoded() const; - + /// Gets the lattice. The output lattice has any acoustic scaling in it /// (which will typically be desirable in an online-decoding context); if you /// want an un-scaled lattice, scale it using ScaleLattice() with the inverse @@ -92,7 +74,7 @@ class SingleUtteranceNnet3Decoder { /// final-probs to be included. void GetLattice(bool end_of_utterance, CompactLattice *clat) const; - + /// Outputs an FST corresponding to the single best path through the current /// lattice. If "use_final_probs" is true AND we reached the final-state of /// the graph then it will include those as final-probs, else it will treat @@ -106,23 +88,27 @@ class SingleUtteranceNnet3Decoder { bool EndpointDetected(const OnlineEndpointConfig &config); const LatticeFasterOnlineDecoder &Decoder() const { return decoder_; } - + ~SingleUtteranceNnet3Decoder() { } private: - OnlineNnet3DecodingConfig config_; + const LatticeFasterDecoderConfig &decoder_opts_; + + // this is remembered from the constructor; it's ultimately + // derived from calling FrameShiftInSeconds() on the feature pipeline. + BaseFloat input_feature_frame_shift_in_seconds_; - OnlineFeatureInterface *feature_pipeline_; + // we need to keep a reference to the transition model around only because + // it's needed by the endpointing code. 
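To make the new interface concrete, here is a minimal calling sequence (an illustrative sketch that follows the updated online2-wav-nnet3-latgen-faster.cc later in this series; it assumes the usual option structs, models and decoding FST have already been read in):

    // decodable_info is built once and can be shared by all utterances.
    nnet3::DecodableNnetSimpleLoopedInfo decodable_info(decodable_opts,
                                                        &am_nnet);
    OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
    SingleUtteranceNnet3Decoder decoder(decoder_opts, trans_model,
                                        decodable_info, *decode_fst,
                                        &feature_pipeline);
    while (/* more audio chunks */) {
      feature_pipeline.AcceptWaveform(samp_freq, wave_part);
      if (/* this was the last chunk */)
        feature_pipeline.InputFinished();  // flush out the last frames
      decoder.AdvanceDecoding();
    }
    decoder.FinalizeDecoding();
    CompactLattice clat;
    decoder.GetLattice(true /* end_of_utterance */, &clat);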
+ const TransitionModel &trans_model_; + + nnet3::DecodableAmNnetLoopedOnline decodable_; - const TransitionModel &tmodel_; - - nnet3::DecodableNnet3SimpleOnline decodable_; - LatticeFasterOnlineDecoder decoder_; - + }; - + /// @} End of "addtogroup onlinedecoding" } // namespace kaldi diff --git a/src/online2bin/online2-wav-nnet2-latgen-faster.cc b/src/online2bin/online2-wav-nnet2-latgen-faster.cc index ad8f323aea1..08e2c64995a 100644 --- a/src/online2bin/online2-wav-nnet2-latgen-faster.cc +++ b/src/online2bin/online2-wav-nnet2-latgen-faster.cc @@ -40,10 +40,10 @@ void GetDiagnosticsAndPrintOutput(const std::string &utt, } CompactLattice best_path_clat; CompactLatticeShortestPath(clat, &best_path_clat); - + Lattice best_path_lat; ConvertLattice(best_path_clat, &best_path_lat); - + double likelihood; LatticeWeight weight; int32 num_frames; @@ -57,7 +57,7 @@ void GetDiagnosticsAndPrintOutput(const std::string &utt, KALDI_VLOG(2) << "Likelihood per frame for utterance " << utt << " is " << (likelihood / num_frames) << " over " << num_frames << " frames."; - + if (word_syms != NULL) { std::cerr << utt << ' '; for (size_t i = 0; i < words.size(); i++) { @@ -76,10 +76,10 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; using namespace fst; - + typedef kaldi::int32 int32; typedef kaldi::int64 int64; - + const char *usage = "Reads in wav file(s) and simulates online decoding with neural nets\n" "(nnet2 setup), with optional iVector-based speaker adaptation and\n" @@ -92,22 +92,22 @@ int main(int argc, char *argv[]) { "you want to decode utterance by utterance.\n" "See egs/rm/s5/local/run_online_decoding_nnet2.sh for example\n" "See also online2-wav-nnet2-latgen-threaded\n"; - + ParseOptions po(usage); - + std::string word_syms_rxfilename; - + OnlineEndpointConfig endpoint_config; // feature_config includes configuration for the iVector adaptation, // as well as the basic features. - OnlineNnet2FeaturePipelineConfig feature_config; + OnlineNnet2FeaturePipelineConfig feature_config; OnlineNnet2DecodingConfig nnet2_decoding_config; BaseFloat chunk_length_secs = 0.05; bool do_endpointing = false; bool online = true; - + po.Register("chunk-length", &chunk_length_secs, "Length of chunk size in seconds, that we process. 
Set to <= 0 " "to use all input in one chunk."); @@ -126,24 +126,24 @@ int main(int argc, char *argv[]) { "--chunk-length=-1."); po.Register("num-threads-startup", &g_num_threads, "Number of threads used when initializing iVector extractor."); - + feature_config.Register(&po); nnet2_decoding_config.Register(&po); endpoint_config.Register(&po); - + po.Read(argc, argv); - + if (po.NumArgs() != 5) { po.PrintUsage(); return 1; } - + std::string nnet2_rxfilename = po.GetArg(1), fst_rxfilename = po.GetArg(2), spk2utt_rspecifier = po.GetArg(3), wav_rspecifier = po.GetArg(4), clat_wspecifier = po.GetArg(5); - + OnlineNnet2FeaturePipelineInfo feature_info(feature_config); if (!online) { @@ -151,7 +151,7 @@ int main(int argc, char *argv[]) { feature_info.ivector_extractor_info.greedy_ivector_extractor = true; chunk_length_secs = -1.0; } - + TransitionModel trans_model; nnet2::AmNnet nnet; { @@ -160,25 +160,25 @@ int main(int argc, char *argv[]) { trans_model.Read(ki.Stream(), binary); nnet.Read(ki.Stream(), binary); } - + fst::Fst *decode_fst = ReadFstKaldi(fst_rxfilename); - + fst::SymbolTable *word_syms = NULL; if (word_syms_rxfilename != "") if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename))) KALDI_ERR << "Could not read symbol table from file " << word_syms_rxfilename; - + int32 num_done = 0, num_err = 0; double tot_like = 0.0; int64 num_frames = 0; - + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); RandomAccessTableReader wav_reader(wav_rspecifier); CompactLatticeWriter clat_writer(clat_wspecifier); - + OnlineTimingStats timing_stats; - + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { std::string spk = spk2utt_reader.Key(); const std::vector &uttlist = spk2utt_reader.Value(); @@ -202,14 +202,14 @@ int main(int argc, char *argv[]) { OnlineSilenceWeighting silence_weighting( trans_model, feature_info.silence_weighting_config); - + SingleUtteranceNnet2Decoder decoder(nnet2_decoding_config, trans_model, nnet, *decode_fst, &feature_pipeline); OnlineTimer decoding_timer(utt); - + BaseFloat samp_freq = wave_data.SampFreq(); int32 chunk_length; if (chunk_length_secs > 0) { @@ -218,15 +218,15 @@ int main(int argc, char *argv[]) { } else { chunk_length = std::numeric_limits::max(); } - + int32 samp_offset = 0; std::vector > delta_weights; - + while (samp_offset < data.Dim()) { int32 samp_remaining = data.Dim() - samp_offset; int32 num_samp = chunk_length < samp_remaining ? chunk_length : samp_remaining; - + SubVector wave_part(data, samp_offset, num_samp); feature_pipeline.AcceptWaveform(samp_freq, wave_part); @@ -236,16 +236,19 @@ int main(int argc, char *argv[]) { // no more input. 
flush out last frames feature_pipeline.InputFinished(); } - - if (silence_weighting.Active()) { + + if (silence_weighting.Active() && + feature_pipeline.IvectorFeature() != NULL) { silence_weighting.ComputeCurrentTraceback(decoder.Decoder()); - silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(), - &delta_weights); - feature_pipeline.UpdateFrameWeights(delta_weights); + silence_weighting.GetDeltaWeights( + feature_pipeline.IvectorFeature()->NumFramesReady(), + &delta_weights); + feature_pipeline.IvectorFeature()->UpdateFrameWeights( + delta_weights); } - + decoder.AdvanceDecoding(); - + if (do_endpointing && decoder.EndpointDetected(endpoint_config)) break; } @@ -254,16 +257,16 @@ int main(int argc, char *argv[]) { CompactLattice clat; bool end_of_utterance = true; decoder.GetLattice(end_of_utterance, &clat); - + GetDiagnosticsAndPrintOutput(utt, word_syms, clat, &num_frames, &tot_like); - + decoding_timer.OutputStats(&timing_stats); - + // In an application you might avoid updating the adaptation state if // you felt the utterance had low confidence. See lat/confidence.h feature_pipeline.GetAdaptationState(&adaptation_state); - + // we want to output the lattice with un-scaled acoustics. BaseFloat inv_acoustic_scale = 1.0 / nnet2_decoding_config.decodable_opts.acoustic_scale; @@ -275,7 +278,7 @@ int main(int argc, char *argv[]) { } } timing_stats.Print(online); - + KALDI_LOG << "Decoded " << num_done << " utterances, " << num_err << " with errors."; KALDI_LOG << "Overall likelihood per frame was " << (tot_like / num_frames) diff --git a/src/online2bin/online2-wav-nnet3-latgen-faster.cc b/src/online2bin/online2-wav-nnet3-latgen-faster.cc index 740c9e2221b..62204460159 100644 --- a/src/online2bin/online2-wav-nnet3-latgen-faster.cc +++ b/src/online2bin/online2-wav-nnet3-latgen-faster.cc @@ -41,10 +41,10 @@ void GetDiagnosticsAndPrintOutput(const std::string &utt, } CompactLattice best_path_clat; CompactLatticeShortestPath(clat, &best_path_clat); - + Lattice best_path_lat; ConvertLattice(best_path_clat, &best_path_lat); - + double likelihood; LatticeWeight weight; int32 num_frames; @@ -58,7 +58,7 @@ void GetDiagnosticsAndPrintOutput(const std::string &utt, KALDI_VLOG(2) << "Likelihood per frame for utterance " << utt << " is " << (likelihood / num_frames) << " over " << num_frames << " frames."; - + if (word_syms != NULL) { std::cerr << utt << ' '; for (size_t i = 0; i < words.size(); i++) { @@ -77,10 +77,10 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; using namespace fst; - + typedef kaldi::int32 int32; typedef kaldi::int64 int64; - + const char *usage = "Reads in wav file(s) and simulates online decoding with neural nets\n" "(nnet3 setup), with optional iVector-based speaker adaptation and\n" @@ -91,22 +91,22 @@ int main(int argc, char *argv[]) { " \n" "The spk2utt-rspecifier can just be if\n" "you want to decode utterance by utterance.\n"; - + ParseOptions po(usage); - + std::string word_syms_rxfilename; - - OnlineEndpointConfig endpoint_config; - // feature_config includes configuration for the iVector adaptation, + // feature_opts includes configuration for the iVector adaptation, // as well as the basic features. 
- OnlineNnet2FeaturePipelineConfig feature_config; - OnlineNnet3DecodingConfig nnet3_decoding_config; + OnlineNnet2FeaturePipelineConfig feature_opts; + nnet3::NnetSimpleLoopedComputationOptions decodable_opts; + LatticeFasterDecoderConfig decoder_opts; + OnlineEndpointConfig endpoint_opts; BaseFloat chunk_length_secs = 0.18; bool do_endpointing = false; bool online = true; - + po.Register("chunk-length", &chunk_length_secs, "Length of chunk size in seconds, that we process. Set to <= 0 " "to use all input in one chunk."); @@ -125,32 +125,34 @@ int main(int argc, char *argv[]) { "--chunk-length=-1."); po.Register("num-threads-startup", &g_num_threads, "Number of threads used when initializing iVector extractor."); - - feature_config.Register(&po); - nnet3_decoding_config.Register(&po); - endpoint_config.Register(&po); - + + feature_opts.Register(&po); + decodable_opts.Register(&po); + decoder_opts.Register(&po); + endpoint_opts.Register(&po); + + po.Read(argc, argv); - + if (po.NumArgs() != 5) { po.PrintUsage(); return 1; } - + std::string nnet3_rxfilename = po.GetArg(1), fst_rxfilename = po.GetArg(2), spk2utt_rspecifier = po.GetArg(3), wav_rspecifier = po.GetArg(4), clat_wspecifier = po.GetArg(5); - - OnlineNnet2FeaturePipelineInfo feature_info(feature_config); + + OnlineNnet2FeaturePipelineInfo feature_info(feature_opts); if (!online) { feature_info.ivector_extractor_info.use_most_recent_ivector = true; feature_info.ivector_extractor_info.greedy_ivector_extractor = true; chunk_length_secs = -1.0; } - + TransitionModel trans_model; nnet3::AmNnetSimple am_nnet; { @@ -159,25 +161,32 @@ int main(int argc, char *argv[]) { trans_model.Read(ki.Stream(), binary); am_nnet.Read(ki.Stream(), binary); } - + + // this object contains precomputed stuff that is used by all decodable + // objects. It takes a pointer to am_nnet because if it has iVectors it has + // to modify the nnet to accept iVectors at intervals. 
+ nnet3::DecodableNnetSimpleLoopedInfo decodable_info(decodable_opts, + &am_nnet); + + fst::Fst *decode_fst = ReadFstKaldi(fst_rxfilename); - + fst::SymbolTable *word_syms = NULL; if (word_syms_rxfilename != "") if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename))) KALDI_ERR << "Could not read symbol table from file " << word_syms_rxfilename; - + int32 num_done = 0, num_err = 0; double tot_like = 0.0; int64 num_frames = 0; - + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); RandomAccessTableReader wav_reader(wav_rspecifier); CompactLatticeWriter clat_writer(clat_wspecifier); - + OnlineTimingStats timing_stats; - + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { std::string spk = spk2utt_reader.Key(); const std::vector &uttlist = spk2utt_reader.Value(); @@ -201,14 +210,12 @@ int main(int argc, char *argv[]) { OnlineSilenceWeighting silence_weighting( trans_model, feature_info.silence_weighting_config); - - SingleUtteranceNnet3Decoder decoder(nnet3_decoding_config, - trans_model, - am_nnet, - *decode_fst, - &feature_pipeline); + + SingleUtteranceNnet3Decoder decoder(decoder_opts, trans_model, + decodable_info, + *decode_fst, &feature_pipeline); OnlineTimer decoding_timer(utt); - + BaseFloat samp_freq = wave_data.SampFreq(); int32 chunk_length; if (chunk_length_secs > 0) { @@ -217,15 +224,15 @@ int main(int argc, char *argv[]) { } else { chunk_length = std::numeric_limits::max(); } - + int32 samp_offset = 0; std::vector > delta_weights; - + while (samp_offset < data.Dim()) { int32 samp_remaining = data.Dim() - samp_offset; int32 num_samp = chunk_length < samp_remaining ? chunk_length : samp_remaining; - + SubVector wave_part(data, samp_offset, num_samp); feature_pipeline.AcceptWaveform(samp_freq, wave_part); @@ -235,17 +242,18 @@ int main(int argc, char *argv[]) { // no more input. flush out last frames feature_pipeline.InputFinished(); } - - if (silence_weighting.Active()) { + + if (silence_weighting.Active() && + feature_pipeline.IvectorFeature() != NULL) { silence_weighting.ComputeCurrentTraceback(decoder.Decoder()); silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(), &delta_weights); - feature_pipeline.UpdateFrameWeights(delta_weights); + feature_pipeline.IvectorFeature()->UpdateFrameWeights(delta_weights); } - + decoder.AdvanceDecoding(); - - if (do_endpointing && decoder.EndpointDetected(endpoint_config)) + + if (do_endpointing && decoder.EndpointDetected(endpoint_opts)) break; } decoder.FinalizeDecoding(); @@ -253,19 +261,19 @@ int main(int argc, char *argv[]) { CompactLattice clat; bool end_of_utterance = true; decoder.GetLattice(end_of_utterance, &clat); - + GetDiagnosticsAndPrintOutput(utt, word_syms, clat, &num_frames, &tot_like); - + decoding_timer.OutputStats(&timing_stats); - + // In an application you might avoid updating the adaptation state if // you felt the utterance had low confidence. See lat/confidence.h feature_pipeline.GetAdaptationState(&adaptation_state); - + // we want to output the lattice with un-scaled acoustics. 
BaseFloat inv_acoustic_scale = - 1.0 / nnet3_decoding_config.decodable_opts.acoustic_scale; + 1.0 / decodable_opts.acoustic_scale; ScaleLattice(AcousticLatticeScale(inv_acoustic_scale), &clat); clat_writer.Write(utt, clat); @@ -274,7 +282,7 @@ int main(int argc, char *argv[]) { } } timing_stats.Print(online); - + KALDI_LOG << "Decoded " << num_done << " utterances, " << num_err << " with errors."; KALDI_LOG << "Overall likelihood per frame was " << (tot_like / num_frames) From a2e38fa7886b2059014c9f242dd91e3f9783e30f Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Fri, 10 Feb 2017 17:07:34 -0500 Subject: [PATCH 145/213] [scripts] Checking ivector extractor id, handle cases when ivector ids do not exist (#1417) --- egs/wsj/s5/steps/libs/common.py | 5 +++++ egs/wsj/s5/steps/nnet2/get_ivector_id.sh | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index 66a02062e9c..9d01fae3027 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -299,6 +299,10 @@ def get_ivector_extractor_id(ivector_dir=None): return None [stdout_val, stderr_val] = run_kaldi_command( "steps/nnet2/get_ivector_id.sh {dir}".format(dir=ivector_dir)) + + if (stdout_val.strip() == "") or (stdout_val is None): + return None + return stdout_val.strip() def get_feat_dim(feat_dir): @@ -409,3 +413,4 @@ def write_idct_matrix(feat_dim, cepstral_lifter, file_path): for k in range(0, feat_dim): idct_matrix[k].append(0) write_kaldi_matrix(file_path, idct_matrix) + diff --git a/egs/wsj/s5/steps/nnet2/get_ivector_id.sh b/egs/wsj/s5/steps/nnet2/get_ivector_id.sh index d7be853349d..3ec70757d5a 100755 --- a/egs/wsj/s5/steps/nnet2/get_ivector_id.sh +++ b/egs/wsj/s5/steps/nnet2/get_ivector_id.sh @@ -33,7 +33,7 @@ elif [ -f $ivecdir/final.ie ] ; then echo "$id" > $ivecdir/final.ie.id || exit 1 cat $ivecdir/final.ie.id else - exit 1 + exit 0 fi exit 0 From 4226d50168ff55e641fed62c6a21ff02640c04ec Mon Sep 17 00:00:00 2001 From: Ke Li Date: Sat, 11 Feb 2017 14:28:25 -0500 Subject: [PATCH 146/213] [scripts] add empty-data checks in generate_plots.py (#1394) --- egs/wsj/s5/steps/nnet3/report/generate_plots.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index dddef38573e..233091f8058 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -220,6 +220,10 @@ def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None, for dir in dirs: stats_per_component_per_iter = ( log_parse.parse_progress_logs_for_nonlinearity_stats(dir)) + for key in stats_per_component_per_iter: + if len(stats_per_component_per_iter[key]['stats']) == 0: + logger.warning("Couldn't find any rows for the" + "nonlin stats plot, not generating it") stats_per_dir[dir] = stats_per_component_per_iter # convert the nonlin stats into tables @@ -350,6 +354,9 @@ def generate_clipped_proportion_plots(exp_dir, output_dir, plot, " this might be because there are no " "ClipGradientComponents.".format(dir)) continue + if len(stats_per_dir[dir]) == 0: + logger.warning("Couldn't find any rows for the" + "clipped proportion plot, not generating it") try: main_cp_stats = stats_per_dir[exp_dir]['table'] except KeyError: From 288cf484348231c5b0dcf5e7d67efed4529867cd Mon Sep 17 00:00:00 2001 From: LvHang Date: Sat, 11 Feb 2017 14:34:03 -0500 Subject: [PATCH 147/213] [scripts] Change how the 
--frame argument is set in non-recurrent DNN training (#1389) ... makes it vary on each iteration, not in big chunks of time. --- egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 25fd94d98ff..3e732313612 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -71,7 +71,7 @@ def train_new_models(dir, iter, srand, num_jobs, archive_index = (k % num_archives) + 1 if not chunk_level_training: - frame = (k / num_archives) % frames_per_eg + frame = (k / num_archives + archive_index) % frames_per_eg cache_write_opt = "" if job == 1: From bd629d1d45d4e7d84ed1f0c7be4dc1b62f7b335b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 13 Feb 2017 15:22:31 -0500 Subject: [PATCH 148/213] [scripts,build]: minor fixes only affecting error handling. --- egs/wsj/s5/utils/validate_lang.pl | 4 +++- src/base/get_version.sh | 2 +- tools/config/common_path.sh | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index 008c54ac752..e5bdf75787e 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -758,8 +758,10 @@ sub check_summation { # prepare_lang.sh), the regular L.fst may contain some disambiguation # symbols. if (! defined $is_disambig{$phone}) { - if ($phone == "<>") { + if ($phone eq "<>") { $state = "eos"; + } else if ($phone == 0) { + $exit = 1; print "--> ERROR: unexpected phone sequence=$phoneseq, wordseq=$wordseq\n"; last; } else { $state = $wbtype{$phone}; } diff --git a/src/base/get_version.sh b/src/base/get_version.sh index bf5efa8c14a..4829391ac44 100755 --- a/src/base/get_version.sh +++ b/src/base/get_version.sh @@ -59,7 +59,7 @@ else version="$version.$patch_number" # Check for uncommitted changes in src/. - uncommitted_changes=$(git diff-index HEAD .. | wc -l) + uncommitted_changes=$(git diff-index HEAD -- .. | wc -l) if [ $uncommitted_changes -gt 0 ]; then # Add suffix ~N if there are N files in src/ with uncommitted changes version="$version~$uncommitted_changes" diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh index 3e2ea50d685..fbc4b674474 100644 --- a/tools/config/common_path.sh +++ b/tools/config/common_path.sh @@ -1,5 +1,5 @@ -# we assume KALDI_ROOT is already defined -[ -z "$KALDI_ROOT" ] && echo "The variable KALDI_ROOT must be already defined" && exit 1 +# we assume KALDI_ROOT is already defined +[ -z "$KALDI_ROOT" ] && echo >&2 "The variable KALDI_ROOT must be already defined" && exit 1 # The formatting of the path export command is intentionally weird, because # this allows for easy diff'ing export PATH=\ From 6e69654ad54a5ab6b427287221e016e70c7c2666 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 11 Feb 2017 21:24:21 -0500 Subject: [PATCH 149/213] [src,doc] Documentation updates; fixes to comments. --- src/cudamatrix/cu-matrix.h | 9 ++------- src/doc/online_decoding.dox | 30 ++++++++++++++++++++++++++++++ src/doc/versions.dox | 2 ++ 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index fb26fbf1013..056abb0c8fb 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -96,7 +96,6 @@ class CuMatrixBase { /// Copies column r from column indexes[r] of src. 
/// As a special case, if indexes[i] == -1, sets column i to zero /// indexes.size() must equal this->NumCols(), - /// all elements of "reorder" must be in [-1, src.NumCols()-1], /// and src.NumRows() must equal this.NumRows() void CopyCols(const CuMatrixBase &src, const CuArray &indexes); @@ -105,14 +104,12 @@ class CuMatrixBase { /// Add column indices[r] of src to column r. /// As a special case, if indexes[i] == -1, skip column i /// indices.size() must equal this->NumCols(), - /// all elements of "reorder" must be in [-1, src.NumCols()-1], /// and src.NumRows() must equal this.NumRows() void AddCols(const CuMatrixBase &src, const CuArray &indices); /// Copies row r from row indexes[r] of src. - /// As a special case, if indexes[i] < 0, sets row i to zero - /// "reorder".size() must equal this->NumRows(), and + /// As a special case, if indexes[i] < 0, sets row i to zero. /// src.NumCols() must equal this.NumCols() void CopyRows(const CuMatrixBase &src, const CuArray &indexes); @@ -136,9 +133,7 @@ class CuMatrixBase { /// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]). /// If indexes[r] < 0, does not add anything. - /// "reorder".size() must equal this->NumRows(), - /// all elements of "reorder" must be in [0, src.NumRows()-1], - /// and src.NumCols() must equal this.NumCols() + /// src.NumCols() must equal this.NumCols() void AddRows(Real alpha, const CuMatrixBase &src, const CuArray &indexes); diff --git a/src/doc/online_decoding.dox b/src/doc/online_decoding.dox index 52be3d38bca..799bfb5895f 100644 --- a/src/doc/online_decoding.dox +++ b/src/doc/online_decoding.dox @@ -410,6 +410,36 @@ utils/mkgraph.sh $lang_own $model_dir $graph_own_dir || exit 1; where $model_dir is the model directory which contains the model "final.mdl" and the tree "tree". We now can use $graph_own_dir/HCLG.fst to replace the old HCLG.fst. + + +\section online_decoding_nnet3 Online decoding with nnet3 models + +Online decoding with nnet3 models is basically the same as with nnet2 +models as described in \ref online_decoding_nnet2. However, there are +some limitations as to the model type you can use. In Kaldi 5.0 and +earlier, online nnet3 decoding does not support recurrent models. +In Kaldi 5.1 and later, online nnet3 decoding supports "forward" +recurrent models such as LSTMs, but not bidirectional ones like BLSTMs. +In addition, online nnet3 decoding with recurrent +models may not give optimal results unless +you use "Kaldi-5.1-style" configuration, including the "decay-time" +option and specifying --extra-left-context-initial 0; see +\ref dnn3_scripts_context for more discussions of these issues. + + +Many of the issues in online nnet3 decoding are the same as in nnet2 +decoding and the command lines are quite similar. For online nnet3 +decoding with Kaldi 5.1 and later, the best example script for online +decoding including model training is, at the +time of writing, egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh +(at the time of writing this is only available in the 'shortcut' branch, +as Kaldi 5.1 has not yet been merged to master); +and downloadable models that can be used with online nnet3 decoding, please +see http://kaldi-asr.org/models.html (the first model there, the ASPIRE model, +includes instructions in a README file). 
+ + + */ diff --git a/src/doc/versions.dox b/src/doc/versions.dox index 2c67b2de317..56cdcdf4118 100644 --- a/src/doc/versions.dox +++ b/src/doc/versions.dox @@ -85,6 +85,8 @@ in nnet3; this allows faster and more-easily-online decoding for recurrent setups (but only unidirectionally-recurrent ones, like LSTMs but not BLSTMs). + - \ref online_decoding_nnet3 is now rewritten; it's faster and it supports + models like LSTMs. - The sequence-training scripts in nnet3 are refactored and are now simpler and use less disk space. From 476c45d2d15a51b3db2b1003e874f3c0d52e9ab7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 13 Feb 2017 23:37:31 -0500 Subject: [PATCH 150/213] [scripts] Removing tdnn-*-layer from xconfigs (redundant). --- .../s5c/local/chain/tuning/run_tdnn_7j.sh | 12 +- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 4 - egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py | 107 ------------------ src/nnet3/nnet-utils.h | 2 +- 4 files changed, 7 insertions(+), 118 deletions(-) delete mode 100644 egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh index 9aec95393d1..793b40f7fe3 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh @@ -126,12 +126,12 @@ if [ $stage -le 12 ]; then # the first splicing is moved before the lda layer, so no splicing here relu-renorm-layer name=tdnn1 dim=768 - tdnn-relu-renorm-layer name=tdnn2 splice-indexes=-1,0,1 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn3 splice-indexes=-1,0,1 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn4 splice-indexes=-3,0,3 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn5 splice-indexes=-3,0,3 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn6 splice-indexes=-3,0,3 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn7 splice-indexes=-3,0,3 dim=768 subset-dim=384 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=768 subset-dim=384 ## adding the layers for chain branch relu-renorm-layer name=prefinal-chain input=tdnn7 dim=768 target-rms=0.5 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 89458c65152..918d8bd2fb2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -22,10 +22,6 @@ 'relu-renorm-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : xlayers.XconfigBasicLayer, 'tanh-layer' : xlayers.XconfigBasicLayer, - 'tdnn-relu-layer' : xlayers.XconfigTdnnLayer, - 'tdnn-relu-renorm-layer' : xlayers.XconfigTdnnLayer, - 'tdnn-sigmoid-layer' : xlayers.XconfigTdnnLayer, - 'tdnn-tanh-layer' : xlayers.XconfigTdnnLayer, 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, 'affine-layer' : xlayers.XconfigAffineLayer, 'lstm-layer' : xlayers.XconfigLstmLayer, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py deleted file mode 100644 index ed7b6f1f53c..00000000000 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2016 Johns Hopkins University (Dan 
Povey) -# 2016 Vijayaditya Peddinti -# Apache 2.0. - - -""" This module contains the implementation of the TDNN layer. -""" - -import libs.nnet3.xconfig.utils as xutils -from libs.nnet3.xconfig.basic_layers import XconfigBasicLayer -from libs.nnet3.xconfig.basic_layers import XconfigLayerBase - -class XconfigTdnnLayer(XconfigBasicLayer): - """This class is for parsing lines like - tdnn-relu-renorm-layer name=tdnn1 dim=1024 splice-indexes=-3,0,3 subset-dim=512 - - It is similar to XconfigBasicLayer except for the way in which the input - splicing is done. So we derive this class from XconfigBasicLayer. - """ - - def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token in [ 'tdnn-relu-layer', 'tdnn-relu-renorm-layer', - 'tdnn-sigmoid-layer', 'tdnn-tanh-layer' ] - XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) - - - def set_default_configs(self): - - super(XconfigTdnnLayer, self).set_default_configs() - - self.config['splice-indexes'] = '' - self.config['subset-dim'] = -1 - - def check_configs(self): - - if self.config['splice-indexes'] == '': - raise RuntimeError("splice-indexes must be non-empty") - super(XconfigTdnnLayer, self).check_configs() - - - def _generate_config(self): - split_layer_name = self.layer_type.split('-') - assert split_layer_name[-1] == 'layer' - # ignore the first 'tdnn' and the last 'layer' - nonlinearities = split_layer_name[1:-1] - - # by 'descriptor_final_string' we mean a string that can appear in - # config-files, i.e. it contains the 'final' names of nodes. - input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] - splice_indexes = self.get_splice_indexes() - input_desc, input_dim, sp_configs = self.splice_input(input_desc, - input_dim, splice_indexes, self.config['subset-dim'], - '{0}.input-subset'.format(self.name)) - - return sp_configs + self._add_components(input_desc, input_dim, nonlinearities) - - def get_splice_indexes(self): - try: - return map(lambda x: int(x), self.config['splice-indexes'].split(",")) - except ValueError: - raise RuntimeError("Invalid value for splice-indexes.") - - @staticmethod - def splice_input(input_desc, input_dim, - splice_indexes, subset_dim = -1, - dim_range_node_name = None ): - """Convenience function to create an appended descriptor with the - splice_indexes. 
- """ - - configs = [] - try: - zero_index = splice_indexes.index(0) - except ValueError: - zero_index = None - - if subset_dim > 0: - assert(dim_range_node_name is not None) - # if subset_dim is specified the script expects a zero - # in the splice indexes - assert(zero_index is not None) - line = ("dim-range-node name={0}" - " input-node={1}" - " dim-offset={2}" - " dim={3}" - "".format(dim_range_node_name, - input_desc, 0, subset_dim)) - configs.append(line) - subset_desc = dim_range_node_name - - else: - subset_desc = input_desc - subset_dim = input_dim - - appended_descriptors = [] - appended_dimension = 0 - for j in range(len(splice_indexes)): - if j == zero_index: - appended_descriptors.append(input_desc) - appended_dimension += input_dim - continue - appended_descriptors.append('Offset({0}, {1})'.format(subset_desc, splice_indexes[j])) - appended_dimension += subset_dim - return ["Append({0})".format(", ".join(appended_descriptors)), - appended_dimension, - configs] diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 3bda01271d2..766b0ed1798 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -80,7 +80,7 @@ std::string PrintVectorPerUpdatableComponent(const Nnet &nnet, const VectorBase &vec); /// This function returns true if the nnet has the following properties: -/// It has an called "output" (other outputs are allowed but may be +/// It has an output called "output" (other outputs are allowed but may be /// ignored). /// It has an input called "input", and possibly an extra input called /// "ivector", but no other inputs. From 886e45ad68c4f532e85fbf8bf268d05a306ef846 Mon Sep 17 00:00:00 2001 From: Shiyin Kang Date: Thu, 16 Feb 2017 08:17:47 +0800 Subject: [PATCH 151/213] [src] Add element-wise matrix min operation (#1424) --- src/cudamatrix/cu-kernels-ansi.h | 5 ++++- src/cudamatrix/cu-kernels.cu | 24 ++++++++++++++++++++++++ src/cudamatrix/cu-kernels.h | 24 ++++++++++++++++-------- src/cudamatrix/cu-matrix-test.cc | 22 ++++++++++++++++++++++ src/cudamatrix/cu-matrix.cc | 25 +++++++++++++++++++++++++ src/cudamatrix/cu-matrix.h | 2 ++ src/matrix/kaldi-matrix.cc | 13 +++++++++++++ src/matrix/kaldi-matrix.h | 2 ++ 8 files changed, 108 insertions(+), 9 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 116428ea82c..9c274283b7e 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -105,6 +105,8 @@ void cudaF_div_elements(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride); void cudaF_max(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride); +void cudaF_min(dim3 Gr, dim3 Bl, float *mat, const float *other, + MatrixDim mat_d, int other_stride); void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d); void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, @@ -373,6 +375,8 @@ void cudaD_div_elements(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride); void cudaD_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride); +void cudaD_min(dim3 Gr, dim3 Bl, double *mat, const double *other, + MatrixDim mat_d, int other_stride); void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d); void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, @@ -696,7 +700,6 @@ void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, float* self_repair_sum_out, const int 
self_repair_sum_out_stride); - void cudaD_copy_cols_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in); void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index abb4efd47ef..a1a1e6c633b 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -385,6 +385,20 @@ static void _max(Real* mat, const Real* A, MatrixDim dst_d, int src_stride) { } } +template +__global__ +static void _min(Real* mat, const Real* other, MatrixDim mat_d, + int other_stride) { + int32_cuda j = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda i = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda mat_index = i * mat_d.stride + j; + int32_cuda other_index = i * other_stride + j; + if (j < mat_d.cols && i < mat_d.rows) { + Real a = mat[mat_index], b = other[other_index]; + mat[mat_index] = fmin(a, b); + } +} + template __global__ static void _vec_mul_elements(Real* v, const Real* a, int dim) { @@ -3350,6 +3364,11 @@ void cudaF_max(dim3 Gr, dim3 Bl, float* mat, const float* A, MatrixDim dst_d, _max<<>>(mat,A,dst_d,src_stride); } +void cudaF_min(dim3 Gr, dim3 Bl, float* mat, const float* other, + MatrixDim mat_d, int other_stride) { + _min<<>>(mat,other,mat_d,other_stride); +} + void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) { _mul_cols_vec<<>>(mat,scale,d); @@ -3999,6 +4018,11 @@ void cudaD_max(dim3 Gr, dim3 Bl, double* mat, const double* A, MatrixDim dst_d, _max<<>>(mat,A,dst_d,src_stride); } +void cudaD_min(dim3 Gr, dim3 Bl, double* mat, const double* other, MatrixDim mat_d, + int other_stride) { + _min<<>>(mat,other,mat_d,other_stride); +} + void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) { _mul_cols_vec<<>>(mat,scale,d); diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 649a25ab67e..bc0f170043d 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -297,6 +297,10 @@ inline void cuda_max(dim3 Gr, dim3 Bl, float *mat, const float *A, MatrixDim dst_d, int src_stride) { cudaF_max(Gr, Bl, mat, A, dst_d, src_stride); } +inline void cuda_min(dim3 Gr, dim3 Bl, float *mat, const float *other, + MatrixDim mat_d, int other_stride) { + cudaF_min(Gr, Bl, mat, other, mat_d, other_stride); +} inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_cols_vec(Gr, Bl, mat, scale, d); @@ -548,15 +552,15 @@ inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, cudaF_diff_tanh(Gr, Bl, eout, e, y, d, e_stride, y_stride); } inline void cuda_parametric_relu(dim3 Gr, dim3 Bl, float *y, const float *x, - MatrixDim d, int src_stride, - const float *a, const float *b) { - cudaF_parametric_relu(Gr,Bl,y,x,d,src_stride,a,b); + MatrixDim d, int src_stride, const float *a, + const float *b) { + cudaF_parametric_relu(Gr, Bl, y, x, d, src_stride, a, b); } inline void cuda_diff_parametric_relu(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride, const float *a, const float *b) { - cudaF_diff_parametric_relu(Gr,Bl,eout,e,y,d,e_stride,y_stride,a,b); + cudaF_diff_parametric_relu(Gr, Bl, eout, e, y, d, e_stride, y_stride, a, b); } inline void cuda_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { @@ -837,6 +841,10 @@ inline void cuda_max(dim3 Gr, dim3 Bl, double *mat, const double *A, MatrixDim dst_d, int src_stride) { 
cudaD_max(Gr, Bl, mat, A, dst_d, src_stride); } +inline void cuda_min(dim3 Gr, dim3 Bl, double *mat, const double *other, + MatrixDim mat_d, int other_stride) { + cudaD_min(Gr, Bl, mat, other, mat_d, other_stride); +} inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_cols_vec(Gr, Bl, mat, scale, d); @@ -1093,15 +1101,15 @@ inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, cudaD_diff_tanh(Gr, Bl, eout, e, y, d, e_stride, y_stride); } inline void cuda_parametric_relu(dim3 Gr, dim3 Bl, double *y, const double *x, - MatrixDim d, int src_stride, - const double *a, const double *b) { - cudaD_parametric_relu(Gr,Bl,y,x,d,src_stride,a,b); + MatrixDim d, int src_stride, const double *a, + const double *b) { + cudaD_parametric_relu(Gr, Bl, y, x, d, src_stride, a, b); } inline void cuda_diff_parametric_relu(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride, const double *a, const double *b) { - cudaD_diff_parametric_relu(Gr,Bl,eout,e,y,d,e_stride,y_stride,a,b); + cudaD_diff_parametric_relu(Gr, Bl, eout, e, y, d, e_stride, y_stride, a, b); } inline void cuda_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 38c800d8e58..6d172a36954 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -870,6 +870,27 @@ static void UnitTestCuMatrixMax() { AssertEqual(Ha,Ha2); } +template +static void UnitTestCuMatrixMin() { + Matrix Ha(100,100); + Matrix Hb(100,100); + Ha.SetRandn(); + Hb.SetRandn(); + + CuMatrix Da(100,100); + CuMatrix Db(100,100); + Da.CopyFromMat(Ha); + Db.CopyFromMat(Hb); + + Da.Min(Db); + Ha.Min(Hb); + + Matrix Ha2(100,100); + Da.CopyToMat(&Ha2); + + AssertEqual(Ha, Ha2); +} + template @@ -2620,6 +2641,7 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixMulElements(); UnitTestCuMatrixDivElements(); UnitTestCuMatrixMax(); + UnitTestCuMatrixMin(); UnitTestCuMatrixMulColsVec(); UnitTestCuMatrixMulRowsVec(); UnitTestCuMatrixDivRowsVec(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 652364f3dc8..cfa570233c3 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -721,6 +721,31 @@ void CuMatrixBase::Max(const CuMatrixBase& A) { } +template +void CuMatrixBase::Min(const CuMatrixBase& A) { + #if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + + KALDI_ASSERT(num_cols_ == A.NumCols()); + KALDI_ASSERT(num_rows_ == A.NumRows()); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + + cuda_min(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else + #endif + { + Mat().Min(A.Mat()); + } +} + + template void CuMatrixBase::MulColsVec(const CuVectorBase &scale) { #if HAVE_CUDA == 1 diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 056abb0c8fb..0a4c4b0669e 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -405,6 +405,8 @@ class CuMatrixBase { void DivElements(const CuMatrixBase &A); /// Do, elementwise, *this = max(*this, A). void Max(const CuMatrixBase &A); + /// Do, elementwise, *this = min(*this, A). 
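As a quick illustration of the semantics of the new method (a sketch, not taken from the patch): after calling a.Min(b) on two same-sized matrices, each element of a holds the elementwise minimum, mirroring the existing Max():

    CuMatrix<BaseFloat> a(4, 4), b(4, 4);
    a.SetRandn();
    b.SetRandn();
    a.Min(b);  // now a(i, j) == min(old a(i, j), b(i, j)); dims must match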
+ void Min(const CuMatrixBase &A); /// scale i'th column by scale[i] void MulColsVec(const CuVectorBase &scale); /// scale i'th row by scale[i] diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index 523af1d70ec..50c23a7be63 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -1041,6 +1041,19 @@ template void MatrixBase::Max(const MatrixBase &A) { } } +template void MatrixBase::Min(const MatrixBase &A) { + KALDI_ASSERT(A.NumRows() == NumRows() && A.NumCols() == NumCols()); + for (MatrixIndexT row = 0; row < num_rows_; row++) { + Real *row_data = RowData(row); + const Real *other_row_data = A.RowData(row); + MatrixIndexT num_cols = num_cols_; + for (MatrixIndexT col = 0; col < num_cols; col++) { + row_data[col] = std::min(row_data[col], + other_row_data[col]); + } + } +} + template void MatrixBase::Scale(Real alpha) { if (alpha == 1.0) return; diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index e254fcad118..25b999fe062 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -232,6 +232,8 @@ class MatrixBase { /// Set, element-by-element, *this = max(*this, A) void Max(const MatrixBase &A); + /// Set, element-by-element, *this = min(*this, A) + void Min(const MatrixBase &A); /// Equivalent to (*this) = (*this) * diag(scale). Scaling /// each column by a scalar taken from that dimension of the vector. From 6070209cf79fb5708ede8307f60210d055deaf1c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 17 Feb 2017 00:15:44 -0500 Subject: [PATCH 152/213] [src] Make various tests faster, especially without GPU (#1428) --- src/cudamatrix/cu-math-test.cc | 4 +- src/cudamatrix/cu-matrix-speed-test.cc | 7 +- src/cudamatrix/cu-matrix-test.cc | 30 ++------ src/cudamatrix/cu-rand-speed-test.cc | 3 +- src/gmm/am-diag-gmm-test.cc | 4 +- src/nnet2/nnet-component-test.cc | 10 +-- src/nnet3/nnet-component-test.cc | 11 +-- src/nnet3/nnet-derivative-test.cc | 98 ++++++++++++++------------ 8 files changed, 80 insertions(+), 87 deletions(-) diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index abd93fb1a0a..6b9119b42c1 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -539,7 +539,7 @@ template void CudaMathUnitTest() { int main() { int32 loop = 0; #if HAVE_CUDA == 1 - for (; loop < 2; loop++) { + for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU @@ -564,7 +564,7 @@ int main() { else KALDI_LOG << "Tests with GPU use (if available) succeeded."; #if HAVE_CUDA == 1 - } + } // No for loop if 'HAVE_CUDA != 1', CuDevice::Instantiate().PrintProfile(); #endif return 0; diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc index 0e139cf9ec3..5710963254a 100644 --- a/src/cudamatrix/cu-matrix-speed-test.cc +++ b/src/cudamatrix/cu-matrix-speed-test.cc @@ -1085,8 +1085,9 @@ template void CudaMatrixSpeedTest() { int main() { + int32 loop = 0; #if HAVE_CUDA == 1 - for (int32 loop = 0; loop < 2; loop++) { + for (loop = 0; loop < 2; loop++) { if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); else @@ -1104,8 +1105,8 @@ int main() { kaldi::CudaMatrixSpeedTest(); #endif #if HAVE_CUDA == 1 - } + } // No for loop if 'HAVE_CUDA != 1', CuDevice::Instantiate().PrintProfile(); #endif - KALDI_LOG << "Tests succeeded."; + std::cout << "Tests succeeded.\n"; } diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 
6d172a36954..b0fcdf1d192 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -870,27 +870,6 @@ static void UnitTestCuMatrixMax() { AssertEqual(Ha,Ha2); } -template -static void UnitTestCuMatrixMin() { - Matrix Ha(100,100); - Matrix Hb(100,100); - Ha.SetRandn(); - Hb.SetRandn(); - - CuMatrix Da(100,100); - CuMatrix Db(100,100); - Da.CopyFromMat(Ha); - Db.CopyFromMat(Hb); - - Da.Min(Db); - Ha.Min(Hb); - - Matrix Ha2(100,100); - Da.CopyToMat(&Ha2); - - AssertEqual(Ha, Ha2); -} - template @@ -2641,7 +2620,6 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixMulElements(); UnitTestCuMatrixDivElements(); UnitTestCuMatrixMax(); - UnitTestCuMatrixMin(); UnitTestCuMatrixMulColsVec(); UnitTestCuMatrixMulRowsVec(); UnitTestCuMatrixDivRowsVec(); @@ -2731,7 +2709,7 @@ template void CudaMatrixUnitTest() { int main() { int32 loop = 0; #if HAVE_CUDA == 1 - for (; loop < 2; loop++) { + for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); @@ -2741,6 +2719,7 @@ int main() { kaldi::CudaMatrixUnitTest(); + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) { kaldi::CudaMatrixUnitTest(); @@ -2755,10 +2734,9 @@ int main() { KALDI_LOG << "Tests without GPU use succeeded."; else KALDI_LOG << "Tests with GPU use (if available) succeeded."; - - SetVerboseLevel(4); #if HAVE_CUDA == 1 - } + } // No for loop if 'HAVE_CUDA != 1', + SetVerboseLevel(4); CuDevice::Instantiate().PrintProfile(); #endif return 0; diff --git a/src/cudamatrix/cu-rand-speed-test.cc b/src/cudamatrix/cu-rand-speed-test.cc index abcae76c598..3c33b780a12 100644 --- a/src/cudamatrix/cu-rand-speed-test.cc +++ b/src/cudamatrix/cu-rand-speed-test.cc @@ -214,8 +214,7 @@ int main() { kaldi::CuRandGaussianVectorSpeedTest(iter); fprintf(stderr, "--- ELAPSED %fs.\n\n", t.Elapsed()); #if HAVE_CUDA == 1 - } // NO for loop if 'HAVE_CUDA != 1', - + } // No for loop if 'HAVE_CUDA != 1', CuDevice::Instantiate().PrintProfile(); #endif KALDI_LOG << "Tests succeeded."; diff --git a/src/gmm/am-diag-gmm-test.cc b/src/gmm/am-diag-gmm-test.cc index 54ca3c153ce..d40ef3df2e4 100644 --- a/src/gmm/am-diag-gmm-test.cc +++ b/src/gmm/am-diag-gmm-test.cc @@ -66,7 +66,7 @@ void TestAmDiagGmmIO(const AmDiagGmm &am_gmm) { loglike2 += am_gmm2->LogLikelihood(i, feat); kaldi::AssertEqual(loglike, loglike2, 1e-4); delete am_gmm2; - + unlink("tmpf"); unlink("tmpfb"); } @@ -122,7 +122,7 @@ void UnitTestAmDiagGmm() { } int main() { - for (int i = 0; i < 10; i++) + for (int i = 0; i < 5; i++) UnitTestAmDiagGmm(); std::cout << "Test OK.\n"; return 0; diff --git a/src/nnet2/nnet-component-test.cc b/src/nnet2/nnet-component-test.cc index 4589ef52aa7..04e476c01bd 100644 --- a/src/nnet2/nnet-component-test.cc +++ b/src/nnet2/nnet-component-test.cc @@ -856,9 +856,9 @@ int main() { using namespace kaldi; using namespace kaldi::nnet2; - - for (int32 loop = 0; loop < 2; loop++) { + int32 loop = 0; #if HAVE_CUDA == 1 + for (loop = 0; loop < 2; loop++) { //// Uncomment the following line to expose the bug in UnitTestDropoutComponent //CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) @@ -868,7 +868,9 @@ int main() { #endif BasicDebugTestForSpliceMax(true); - for (int32 i = 0; i < 3; i++) { + // We used to test this 3 times, but now that nnet2 is rarely changed, + // reducing it to once. 
+ for (int32 i = 0; i < 1; i++) { UnitTestGenericComponent(); UnitTestGenericComponent(); UnitTestGenericComponent("power=1.5"); @@ -905,8 +907,8 @@ int main() { else KALDI_LOG << "Tests with GPU use (if available) succeeded."; } - } #if HAVE_CUDA == 1 + } // No for loop if 'HAVE_CUDA != 1', CuDevice::Instantiate().PrintProfile(); #endif return 0; diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index 288179b2ffe..fdc9849dfc2 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -524,8 +524,9 @@ int main() { using namespace kaldi; using namespace kaldi::nnet3; TestStringsApproxEqual(); - for (kaldi::int32 loop = 0; loop < 2; loop++) { + kaldi::int32 loop = 0; #if HAVE_CUDA == 1 + for (loop = 0; loop < 2; loop++) { //CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); @@ -533,9 +534,11 @@ int main() { CuDevice::Instantiate().SelectGpuId("yes"); #endif UnitTestNnetComponent(); - } - - KALDI_LOG << "Nnet component ntests succeeded."; +#if HAVE_CUDA == 1 + } // No for loop if 'HAVE_CUDA != 1', + CuDevice::Instantiate().PrintProfile(); +#endif + KALDI_LOG << "Nnet component tests succeeded."; return 0; } diff --git a/src/nnet3/nnet-derivative-test.cc b/src/nnet3/nnet-derivative-test.cc index f76377a544c..3a974fa0b6d 100644 --- a/src/nnet3/nnet-derivative-test.cc +++ b/src/nnet3/nnet-derivative-test.cc @@ -95,7 +95,7 @@ void UnitTestNnetModelDerivatives() { //gen_config.allow_nonlinearity = false; //gen_config.allow_recursion = false; //gen_config.allow_final_nonlinearity = true; - + bool allow_optimization = true; bool limit_deriv_times = (RandInt(0, 2) == 0); std::vector configs; @@ -118,26 +118,45 @@ void UnitTestNnetModelDerivatives() { // whether input-derivatives are required or not does not matter, // so leave it as it is in that regard. - NnetOptimizeOptions optimize_opts; - CachingOptimizingCompilerOptions compiler_opts; - if (limit_deriv_times) { - SetDerivTimesOptions(request, &optimize_opts); - } + NnetComputation computation; + Compiler compiler(request, nnet); - CachingOptimizingCompiler compiler(nnet, optimize_opts, - compiler_opts); + CompilerOptions opts; + compiler.CreateComputation(opts, &computation); + { + std::ostringstream os; + computation.Print(os, nnet); + KALDI_LOG << "Generated computation is: " << os.str(); + } + CheckComputationOptions check_config; + // we can do the rewrite check since it's before optimization. + check_config.check_rewrite = true; + ComputationChecker checker(check_config, nnet, computation); + checker.Check(); - const NnetComputation &computation = *(compiler.Compile(request)); + if (RandInt(0, 3) != 0 && allow_optimization) { + NnetOptimizeOptions opt_config; + if (limit_deriv_times) + SetDerivTimesOptions(request, &opt_config); - { + Optimize(opt_config, nnet, request, &computation); std::ostringstream os; computation.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); + check_config.check_rewrite = false; + ComputationChecker checker_opt(check_config, nnet, computation); + checker_opt.Check(); } + NnetComputeOptions compute_opts; + if (RandInt(0, 1) == 0) + compute_opts.debug = true; + computation.ComputeCudaIndexes(); + + Nnet nnet_deriv(nnet); - ScaleNnet(0.0, &nnet_deriv); - SetNnetAsGradient(&nnet_deriv); // forces "simple" update and unit + bool is_gradient = true; + SetZero(is_gradient, &nnet_deriv); // forces "simple" update and unit // learning rate. int32 num_directions = 4; // must be >= 1. 
Best if it's >1, will reduce @@ -158,11 +177,6 @@ void UnitTestNnetModelDerivatives() { nnet.OutputDim("output")); output_deriv.SetRandn(); - - NnetComputeOptions compute_opts; - if (RandInt(0, 1) == 0) - compute_opts.debug = true; - // pass 0 is the forward pass with the un-perturbed model. // Other passes are with various differently-perturbed versions of // the model. @@ -184,7 +198,7 @@ void UnitTestNnetModelDerivatives() { } KALDI_LOG << "Running forward computation"; - computer.Run(); + computer.Forward(); const CuMatrixBase &output(computer.GetOutput("output")); KALDI_LOG << "Output sum for pass " << pass << " is " << output.Sum(); @@ -194,9 +208,9 @@ void UnitTestNnetModelDerivatives() { if (pass == 0) { // we need to do the backward computation (to get the model derivative) CuMatrix temp(output_deriv); - computer.AcceptInput("output", &temp); + computer.AcceptOutputDeriv("output", &temp); KALDI_LOG << "Running backward computation"; - computer.Run(); + computer.Backward(); } else { // work out the predicted objf-change as dot-product of deriv and // parameter-change. The expression below can be interpreted as @@ -289,9 +303,7 @@ void UnitTestNnetInputDerivatives() { if (RandInt(0, 3) != 0 && allow_optimization) { NnetOptimizeOptions opt_config; // opt_config.initialize_undefined = false; // temp - Optimize(opt_config, nnet, - MaxOutputTimeInRequest(request), - &computation); + Optimize(opt_config, nnet, request, &computation); std::ostringstream os; computation.Print(os, nnet); KALDI_LOG << "Optimized computation is: " << os.str(); @@ -302,6 +314,13 @@ void UnitTestNnetInputDerivatives() { compute_opts.debug = true; computation.ComputeCudaIndexes(); + // the only reason we might need to provide the &nnet parameter is if the + // StoreStats() operation had been requested. We made sure no model update + // is being performed. + NnetComputer computer(compute_opts, + computation, + nnet, + &nnet); int32 num_directions = 3; // must be >= 1. Best if it's >1, will reduce // the probability of random failures. @@ -330,18 +349,8 @@ void UnitTestNnetInputDerivatives() { // Other passes are with various differently-perturbed versions of // the features. for (int32 pass = 0; pass <= num_directions + 1; pass++) { - // the only reason we might need to provide the &nnet parameter is if the - // StoreStats() operation had been requested. We made sure no model update - // is being performed. - NnetComputer computer(compute_opts, - computation, - nnet, - &nnet); - - // provide the input to the computations. for (size_t i = 0; i < request.inputs.size(); i++) { - CuMatrix temp(inputs[i]); if (pass > 0 && pass <= num_directions) { // Perturb the input randomly. delta_inputs[i].Resize(inputs[i].NumRows(), inputs[i].NumCols()); @@ -360,7 +369,7 @@ void UnitTestNnetInputDerivatives() { } KALDI_LOG << "Running forward computation"; - computer.Run(); + computer.Forward(); const CuMatrixBase &output(computer.GetOutput("output")); KALDI_LOG << "Output sum for pass " << pass << " is " << output.Sum(); @@ -370,11 +379,11 @@ void UnitTestNnetInputDerivatives() { if (pass == 0) { // We need to compute the input derivatives. 
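What both test routines verify is a first-order finite-difference check of the backprop derivatives; schematically (a sketch with illustrative variable names, not code from the diff):

    // For each random perturbation, compare the first-order prediction
    // against the actually observed change in the objective:
    BaseFloat predicted = DotProduct(nnet_deriv, delta_nnet);      // model test
    BaseFloat predicted_i = TraceMatMat(input_derivs[i],
                                        delta_inputs[i], kTrans);  // input test
    BaseFloat observed = objf_perturbed - objf_baseline;
    // The tests then assert predicted ~= observed, within a tolerance.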
CuMatrix temp(output_deriv); - computer.AcceptInput("output", &temp); + computer.AcceptOutputDeriv("output", &temp); KALDI_LOG << "Running backward computation"; - computer.Run(); + computer.Backward(); for (size_t i = 0; i < request.inputs.size(); i++) { - input_derivs[i] = computer.GetOutput(request.inputs[i].name); + input_derivs[i] = computer.GetInputDeriv(request.inputs[i].name); KALDI_LOG << "Input-deriv norm for '" << request.inputs[i].name << "' is " << input_derivs[i].FrobeniusNorm(); } @@ -416,11 +425,10 @@ void UnitTestNnetInputDerivatives() { int main() { using namespace kaldi; using namespace kaldi::nnet3; - - // SetVerboseLevel(4); - - for (kaldi::int32 loop = 0; loop < 2; loop++) { + kaldi::int32 loop = 0; + //SetVerboseLevel(2); #if HAVE_CUDA == 1 + for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); @@ -429,9 +437,11 @@ int main() { #endif UnitTestNnetModelDerivatives(); UnitTestNnetInputDerivatives(); - } - - KALDI_LOG << "Nnet tests succeeded."; +#if HAVE_CUDA == 1 + } // No for loop if 'HAVE_CUDA != 1', + CuDevice::Instantiate().PrintProfile(); +#endif + KALDI_LOG << "Nnet derivative tests succeeded."; return 0; } From 4261d96f3c78c0e62c36aa2d038b6a5fed2d1f5c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 17 Feb 2017 01:33:41 -0500 Subject: [PATCH 153/213] [src] Make chain-supervision-test.cc faster when no GPU --- src/chain/chain-supervision-test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index e38fbca745f..33d3c74e3a3 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -606,9 +606,9 @@ void TestRanges() { int main() { using namespace kaldi; - - for (int32 loop = 0; loop < 2; loop++) { + int32 loop = 0; #if HAVE_CUDA == 1 + for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); @@ -621,7 +621,7 @@ int main() { } kaldi::chain::TestRanges(); #if HAVE_CUDA == 1 - CuDevice::Instantiate().PrintProfile(); -#endif } + CuDevice::Instantiate().PrintProfile(); +#endif } From bdf205d3076a16868f17d2b129c2b397a2c8b59f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 17 Feb 2017 14:31:02 -0500 Subject: [PATCH 154/213] [build] Updating version file-- this commit marks version 5.1.0 --- src/.version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/.version b/src/.version index 819e07a2243..a75b92f1ed7 100644 --- a/src/.version +++ b/src/.version @@ -1 +1 @@ -5.0 +5.1 From 0c831d6c18396bc78e5dccd913ca6d6457770522 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Fri, 17 Feb 2017 14:14:59 -0800 Subject: [PATCH 155/213] [build] Change travis build to use clang; remove -rdynamic from CXXFLAGS (it's in LDFLAGS) Increase the number of parallel make jobs to 6. Remove -rdynamic flag from CXXFLAGS. It is already in LDFLAGS. 
--- .travis.yml | 11 ++++++----- src/makefiles/darwin.mk | 6 +++--- src/makefiles/linux_atlas.mk | 9 ++++++++- src/makefiles/linux_atlas_arm.mk | 9 ++++++++- src/makefiles/linux_atlas_ppc64le.mk | 9 ++++++++- src/makefiles/linux_clapack.mk | 9 ++++++++- src/makefiles/linux_clapack_arm.mk | 9 ++++++++- src/makefiles/linux_openblas.mk | 9 ++++++++- src/makefiles/linux_openblas_arm.mk | 9 ++++++++- src/makefiles/linux_openblas_ppc64le.mk | 10 +++++++++- src/makefiles/linux_x86_64_mkl.mk | 9 ++++++++- tools/extras/travis_script.sh | 4 ++-- 12 files changed, 84 insertions(+), 19 deletions(-) diff --git a/.travis.yml b/.travis.yml index d3ad85363ce..54ac9f11c9f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,8 @@ notifications: email: false language: cpp +sudo: false +dist: trusty os: - linux @@ -9,13 +11,12 @@ os: addons: apt: sources: - - ubuntu-toolchain-r-test + - llvm-toolchain-trusty-3.9 packages: - gdb - - gcc-4.9 - - g++-4.9 - - gfortran-4.9 + - gfortran - liblapack-dev + - clang-3.9 branches: only: @@ -28,7 +29,7 @@ before_install: - tools/extras/travis_install_bindeps.sh $XROOT script: - - CXX=g++-4.9 + - CXX=clang++-3.9 CFLAGS="-march=native" LDFLAGS="-llapack" INCDIRS="$XROOT/usr/include" diff --git a/src/makefiles/darwin.mk b/src/makefiles/darwin.mk index dffcc878083..81351d185b6 100644 --- a/src/makefiles/darwin.mk +++ b/src/makefiles/darwin.mk @@ -22,9 +22,6 @@ ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif -LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -framework Accelerate -lm -lpthread -ldl - # Compiler specific flags COMPILER = $(shell $(CXX) -v 2>&1) ifeq ($(findstring clang,$(COMPILER)),clang) @@ -34,3 +31,6 @@ else ifeq ($(findstring GCC,$(COMPILER)),GCC) # Allow implicit conversions between vectors. CXXFLAGS += -flax-vector-conversions endif + +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -framework Accelerate -lm -lpthread -ldl diff --git a/src/makefiles/linux_atlas.mk b/src/makefiles/linux_atlas.mk index b30c7ad5474..32a7f43fa50 100644 --- a/src/makefiles/linux_atlas.mk +++ b/src/makefiles/linux_atlas.mk @@ -21,12 +21,19 @@ CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) \ - -msse -msse2 -pthread -rdynamic \ + -msse -msse2 -pthread \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +endif + LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl diff --git a/src/makefiles/linux_atlas_arm.mk b/src/makefiles/linux_atlas_arm.mk index 35e98da51d7..4c83ce71d6c 100644 --- a/src/makefiles/linux_atlas_arm.mk +++ b/src/makefiles/linux_atlas_arm.mk @@ -21,12 +21,19 @@ CXXFLAGS = -std=c++11 -I.. 
-I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) \ - -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ + -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +endif + LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl diff --git a/src/makefiles/linux_atlas_ppc64le.mk b/src/makefiles/linux_atlas_ppc64le.mk index a5962f7964b..1e4194c2869 100644 --- a/src/makefiles/linux_atlas_ppc64le.mk +++ b/src/makefiles/linux_atlas_ppc64le.mk @@ -22,12 +22,19 @@ CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(ATLASINC) \ -m64 -maltivec -mcpu=power8 -mtune=power8 -mpower8-vector -mvsx \ - -pthread -rdynamic \ + -pthread \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +endif + LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl diff --git a/src/makefiles/linux_clapack.mk b/src/makefiles/linux_clapack.mk index 87e016aae5b..75a514a85d7 100644 --- a/src/makefiles/linux_clapack.mk +++ b/src/makefiles/linux_clapack.mk @@ -15,12 +15,19 @@ CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK -I../../tools/CLAPACK \ - -msse -msse2 -pthread -rdynamic \ + -msse -msse2 -pthread \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +endif + LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl diff --git a/src/makefiles/linux_clapack_arm.mk b/src/makefiles/linux_clapack_arm.mk index d21e640d3c1..52a2a663eb7 100644 --- a/src/makefiles/linux_clapack_arm.mk +++ b/src/makefiles/linux_clapack_arm.mk @@ -15,12 +15,19 @@ CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK -I../../tools/CLAPACK \ - -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ + -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. 
+CXXFLAGS += -Wno-mismatched-tags +endif + LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl diff --git a/src/makefiles/linux_openblas.mk b/src/makefiles/linux_openblas.mk index d145c687438..1da16117a68 100644 --- a/src/makefiles/linux_openblas.mk +++ b/src/makefiles/linux_openblas.mk @@ -21,12 +21,19 @@ CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -I$(OPENBLASINC) \ - -msse -msse2 -pthread -rdynamic \ + -msse -msse2 -pthread \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +endif + LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl diff --git a/src/makefiles/linux_openblas_arm.mk b/src/makefiles/linux_openblas_arm.mk index 29a91752509..7f462925c74 100644 --- a/src/makefiles/linux_openblas_arm.mk +++ b/src/makefiles/linux_openblas_arm.mk @@ -21,12 +21,19 @@ CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -I$(OPENBLASINC) \ - -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread -rdynamic \ + -ftree-vectorize -mfloat-abi=hard -mfpu=neon -pthread \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +endif + LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl diff --git a/src/makefiles/linux_openblas_ppc64le.mk b/src/makefiles/linux_openblas_ppc64le.mk index 6550d915c6c..c098b9d92e8 100644 --- a/src/makefiles/linux_openblas_ppc64le.mk +++ b/src/makefiles/linux_openblas_ppc64le.mk @@ -22,12 +22,20 @@ CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_OPENBLAS -I$(OPENBLASINC) \ -m64 -maltivec -mcpu=power8 -mtune=power8 -mpower8-vector -mvsx \ - -pthread -rdynamic \ + -pthread \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +endif + + LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -rdynamic LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index 50b4047def7..26d22253d08 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -29,13 +29,20 @@ CXXFLAGS = -std=c++11 -I.. 
-I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_MKL -I$(MKLROOT)/include \ - -m64 -msse -msse2 -pthread -rdynamic \ + -m64 -msse -msse2 -pthread \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) CXXFLAGS += -fPIC endif +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +endif + ## Use the following for STATIC LINKING of the SEQUENTIAL version of MKL MKL_STA_SEQ = $(MKLLIB)/libmkl_solver_lp64_sequential.a -Wl,--start-group \ $(MKLLIB)/libmkl_intel_lp64.a $(MKLLIB)/libmkl_sequential.a \ diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index b3906450525..8aea788d9bc 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -4,12 +4,12 @@ # Typical usage shown below; any one can be safely left unset. # INCDIRS="~/xroot/usr/include" # LIBDIRS="~/xroot/usr/lib /usr/lib/openblas-base" -# CXX=gcc++-4.9 +# CXX=clang++-3.9 # CFLAGS="-march=native -O2" # LDFLAGS="-llapack" # Maximum make parallelism. Simply -j runs out of memory on Travis VM. -MAXPAR=4 +MAXPAR=6 # Directories with code that can be tested with Travis (space-separated) TESTABLE_DIRS="src/" From 1ef3964a52b3b0e51ea9e2d66795a8a6f3f6efb8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 19 Feb 2017 18:02:55 -0500 Subject: [PATCH 156/213] [scripts] fix syntax error in validate_lang.pl [thanks: daniel galvez] --- egs/wsj/s5/utils/validate_lang.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index e5bdf75787e..2e8125b1dd7 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -760,7 +760,7 @@ sub check_summation { if (! defined $is_disambig{$phone}) { if ($phone eq "<>") { $state = "eos"; - } else if ($phone == 0) { + } elsif ($phone == 0) { $exit = 1; print "--> ERROR: unexpected phone sequence=$phoneseq, wordseq=$wordseq\n"; last; } else { $state = $wbtype{$phone}; From 9d34ff4d029639e7accd57dfee78a192ecf7dbc8 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Sun, 19 Feb 2017 19:02:35 -0800 Subject: [PATCH 157/213] [build] Increase OpenFst version 1.6.0->1.6.1. (#1434) --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 4a8e08823a0..f40a75da5f8 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -7,7 +7,7 @@ CC = gcc # used for sph2pipe # Note: OpenFst requires a relatively recent C++ compiler with C++11 support, # e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. -OPENFST_VERSION = 1.6.0 +OPENFST_VERSION = 1.6.1 OPENFST_VER_NUM := $(shell echo $(OPENFST_VERSION) | sed 's/\./ /g' | xargs printf "%d%02d%02d") ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10503)","1") From 1a7b72c0bea6e17fee96d65265ab7c30e718dc49 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Mon, 20 Feb 2017 13:11:58 -0500 Subject: [PATCH 158/213] [build] Use github not sourceforge for IRSTLM [sourceforge repo dead?] (#1435) I couldn't figure out what version corresponds to svn -r 618, so I'm just using master. 
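If a fixed revision is wanted later for reproducibility, the clone could be pinned along these lines (IRSTLM_REV below is a placeholder; no git revision is known to correspond to the old svn -r 618 checkout):

    # Sketch of a pinned checkout; replace IRSTLM_REV with a specific tag or
    # commit hash once a suitable one is identified.
    IRSTLM_REV=master
    git clone https://github.com/irstlm-team/irstlm.git irstlm
    cd irstlm
    git checkout "$IRSTLM_REV"
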
--- tools/extras/install_irstlm.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/extras/install_irstlm.sh b/tools/extras/install_irstlm.sh index 1bd9aea4aaa..91635a30bbc 100755 --- a/tools/extras/install_irstlm.sh +++ b/tools/extras/install_irstlm.sh @@ -18,8 +18,7 @@ if [ ! -x ./irstlm ] ; then exit 1 fi ( - svn -r 618 co --non-interactive --trust-server-cert \ - https://svn.code.sf.net/p/irstlm/code/trunk irstlm + git clone git@github.com:irstlm-team/irstlm.git irstlm ) || { errcho "****() Error getting the IRSTLM sources. The server hosting it" errcho "****() might be down." From e5304f83e3e4fa996a78d4f68ceeb1f1ee8c01ef Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Mon, 20 Feb 2017 14:00:58 -0500 Subject: [PATCH 159/213] [egs] clean up the HKUST scripts and add scoring filters (#1436) Some fixes to scoring (e.g. don't split english words into characters, only chinese ones). Modify scoring to produce CER and WER numbers. --- egs/hkust/s5/local/character_tokenizer | 32 ++++++++++++++++++++++++ egs/hkust/s5/local/hkust_data_prep.sh | 7 +++--- egs/hkust/s5/local/hkust_prepare_dict.sh | 3 +-- egs/hkust/s5/local/hkust_train_lms.sh | 6 ++++- egs/hkust/s5/local/score.sh | 9 ++++++- egs/hkust/s5/local/wer_output_filter | 25 ++++++++++++++++++ 6 files changed, 74 insertions(+), 8 deletions(-) create mode 100755 egs/hkust/s5/local/character_tokenizer mode change 120000 => 100755 egs/hkust/s5/local/score.sh create mode 100755 egs/hkust/s5/local/wer_output_filter diff --git a/egs/hkust/s5/local/character_tokenizer b/egs/hkust/s5/local/character_tokenizer new file mode 100755 index 00000000000..a3d8098d17f --- /dev/null +++ b/egs/hkust/s5/local/character_tokenizer @@ -0,0 +1,32 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) { + print " $s"; + } else { + @chars = split "", $s; + foreach $c (@chars) { + if ($c =~ /\p{InCJK_Unified_Ideographs}/) { + print " $c"; + } else { + print "$c"; + } + } + } + print " "; + } + print "\n"; +} + + diff --git a/egs/hkust/s5/local/hkust_data_prep.sh b/egs/hkust/s5/local/hkust_data_prep.sh index 07f3c9677d8..207f03af36b 100755 --- a/egs/hkust/s5/local/hkust_data_prep.sh +++ b/egs/hkust/s5/local/hkust_data_prep.sh @@ -104,8 +104,8 @@ awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];e print segment " " audioname "-" side " " startf/100 " " endf/100}' <$dev_dir/text > $dev_dir/segments awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $dev_dir/sph.flist > $dev_dir/sph.scp -sph2pipe=`cd ../../..; echo $PWD/tools/sph2pipe_v2.5/sph2pipe` -[ ! -f $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; +sph2pipe=`which sph2pipe` || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +[ ! 
-x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; cat $train_dir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ @@ -136,5 +136,4 @@ cat $dev_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $dev_dir cat $dev_dir/utt2spk | sort -k 2 | utils/utt2spk_to_spk2utt.pl > $dev_dir/spk2utt || exit 1; echo "$0: HKUST data preparation succeeded" - -exit; +exit 0 diff --git a/egs/hkust/s5/local/hkust_prepare_dict.sh b/egs/hkust/s5/local/hkust_prepare_dict.sh index 5cd864c52cc..6aca37586ed 100755 --- a/egs/hkust/s5/local/hkust_prepare_dict.sh +++ b/egs/hkust/s5/local/hkust_prepare_dict.sh @@ -312,5 +312,4 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", cat - $dict_dir/lexicon1.txt > $dict_dir/lexicon.txt || exit 1; echo "$0: HKUST dict preparation succeeded" - -exit; +exit 0; diff --git a/egs/hkust/s5/local/hkust_train_lms.sh b/egs/hkust/s5/local/hkust_train_lms.sh index d6d0b2aa0bc..8520bb26d2d 100755 --- a/egs/hkust/s5/local/hkust_train_lms.sh +++ b/egs/hkust/s5/local/hkust_train_lms.sh @@ -19,9 +19,13 @@ done dir=data/local/lm mkdir -p $dir +export LC_ALL=C # You'll get errors about things being not sorted, if you + # have a different locale. kaldi_lm=`which train_lm.sh` if [ ! -x $kaldi_lm ]; then - echo "train_lm.sh is not found. Checkout tools/extra/install_kaldi_lm.sh" + echo "$0: train_lm.sh is not found. That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" exit 1 fi diff --git a/egs/hkust/s5/local/score.sh b/egs/hkust/s5/local/score.sh deleted file mode 120000 index df664a0f1f1..00000000000 --- a/egs/hkust/s5/local/score.sh +++ /dev/null @@ -1 +0,0 @@ -../steps/scoring/score_kaldi_cer.sh \ No newline at end of file diff --git a/egs/hkust/s5/local/score.sh b/egs/hkust/s5/local/score.sh new file mode 100755 index 00000000000..766eaf3cd44 --- /dev/null +++ b/egs/hkust/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/hkust/s5/local/wer_output_filter b/egs/hkust/s5/local/wer_output_filter new file mode 100755 index 00000000000..aceeeec41b4 --- /dev/null +++ b/egs/hkust/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + From ac69d1af7935bacf5292dadf2467592d6221827c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 20 Feb 2017 19:02:42 -0500 Subject: [PATCH 160/213] [src,doc] Update version documentation with version 5.1; makefile fix; add details to info string for nnet3 component. 
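For future version bumps, the commit id that goes into a new tuple in src/doc/get_version_info.sh can be obtained roughly as follows (a sketch, run from a checkout of the relevant branch):

    # The abbreviated hash of the last commit that touched src/.version is
    # what the "x.x branch yyyyyy" tuples expect.
    git log -1 --format=%h -- src/.version
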
--- src/doc/get_version_info.sh | 6 +++++- src/doc/versions.dox | 10 ++++++---- src/fstbin/Makefile | 5 ++--- src/nnet3/nnet-general-component.cc | 6 +++++- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/doc/get_version_info.sh b/src/doc/get_version_info.sh index b37ac5f400f..568e53c88dd 100755 --- a/src/doc/get_version_info.sh +++ b/src/doc/get_version_info.sh @@ -28,7 +28,11 @@ fi # Note: when you add new tuples here you'll want to add ndew # \htmlinclude directives in versions.dox. -for tuple in "5.0 master c160a9883"; do +# the tuples will generally be of the form: "x.x master yyyyyy" +# where yyyyy is the result of git log -1 src/.version done on +# that version of Kaldi (we only update the .version file when +# the major/minor version number changes). +for tuple in "5.0 master c160a9883" "5.1 master 2145519961"; do major_minor_number=$(echo $tuple | awk '{print $1}') # e.g. 5.0 branch=$(echo $tuple | awk '{print $2}') # e.g. 'master', or '5.1' (it's a branch name) first_commit=$(echo $tuple | awk '{print $3}') diff --git a/src/doc/versions.dox b/src/doc/versions.dox index 56cdcdf4118..0a16c5f1d3a 100644 --- a/src/doc/versions.dox +++ b/src/doc/versions.dox @@ -19,7 +19,7 @@ // note: you have to run the file get_version_info.sh in order // to generate the HTML files that we include via \htmlinclude. - +// Any time you add a new version you need to edit get_version_info.sh /** @@ -62,7 +62,8 @@ \subsection versions_versions_50 Version 5.0 This is the first major/minor version number after introducing the versioning scheme. - It is currently available in the 'master' branch on github. + The latest revision of version 5.0 is saved as branch "5.0" on github. + Specific patches: \htmlinclude 5.0.html @@ -70,8 +71,7 @@ \subsection versions_versions_51 Version 5.1 - Version 5.1 is in preparation and version 5.1.0 does not actually exist yet. - You can see the development in the 'shortcut' branch on github. + Version 5.1 is the current master branch of Kaldi. Some of the major changes introduced in version 5.1 are: - Kaldi now requires C++11 to compile, and we support only the latest version of OpenFst (1.6.0). (This simplifies Kaldi's code, and will later @@ -90,5 +90,7 @@ - The sequence-training scripts in nnet3 are refactored and are now simpler and use less disk space. 
+ \htmlinclude 5.1.html + */ diff --git a/src/fstbin/Makefile b/src/fstbin/Makefile index 8d544e40ea0..da26c58edd7 100644 --- a/src/fstbin/Makefile +++ b/src/fstbin/Makefile @@ -15,8 +15,7 @@ BINFILES = fstdeterminizestar \ fstmakecontextsyms fstaddsubsequentialloop fstaddselfloops \ fstrmepslocal fstcomposecontext fsttablecompose fstrand fstfactor \ fstdeterminizelog fstphicompose fstrhocompose fstpropfinal fstcopy \ - fstpushspecial fsts-to-transcripts fsts-project fsts-union \ - fsts-scale fsts-difference + fstpushspecial fsts-to-transcripts fsts-project fsts-union OBJFILES = @@ -27,6 +26,6 @@ LIBFILE = ADDLIBS = ../fstext/kaldi-fstext.a ../util/kaldi-util.a \ ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 8cfad20f19e..4aa65ce70ed 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -19,6 +19,7 @@ #include #include +#include #include "nnet3/nnet-general-component.h" #include "nnet3/nnet-computation-graph.h" #include "nnet3/nnet-parse.h" @@ -556,7 +557,7 @@ void StatisticsPoolingComponent::InitFromConfig(ConfigLine *cfl) { if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); + << cfl->UnusedValues(); // do some basic checks here but Check() will check more completely. if (!ok || input_dim_ <= 0 || left_context_ + right_context_ <= 0 || num_log_count_features_ < 0) @@ -968,10 +969,13 @@ std::string BackpropTruncationComponent::Info() const { std::ostringstream stream; stream << Type() << ", dim=" << dim_ << ", scale=" << scale_ + << ", count=" << std::setprecision(3) << count_ << std::setprecision(6) + << ", recurrence-interval=" << recurrence_interval_ << ", clipping-threshold=" << clipping_threshold_ << ", clipped-proportion=" << (count_ > 0.0 ? num_clipped_ / count_ : 0) << ", zeroing-threshold=" << zeroing_threshold_ + << ", zeroing-interval=" << zeroing_interval_ << ", zeroed-proportion=" << (count_zeroing_boundaries_ > 0.0 ? 
num_zeroed_ / count_zeroing_boundaries_ : 0) From dc7864aee492b5b81151be537684506a5604be54 Mon Sep 17 00:00:00 2001 From: Shiyin Kang Date: Tue, 21 Feb 2017 12:24:32 +0800 Subject: [PATCH 161/213] [build] update .gitignore: ignore openfst-1.6.1 (#1439) --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index e6d9c0fd612..02a88acb621 100644 --- a/.gitignore +++ b/.gitignore @@ -92,6 +92,8 @@ GSYMS /tools/openfst-1.5.4/ /tools/openfst-1.6.0.tar.gz /tools/openfst-1.6.0/ +/tools/openfst-1.6.1.tar.gz +/tools/openfst-1.6.1/ /tools/pa_stable_v19_20111121.tgz /tools/portaudio/ /tools/sctk-2.4.0-20091110-0958.tar.bz2 From 0e5cea8f12056782f61560690a79419a3fc651d9 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 21 Feb 2017 00:28:25 -0500 Subject: [PATCH 162/213] [src,egs,scripts] Support frame-subsampling with non-chain nnet3 models (#1238) --- egs/swbd/s5c/local/nnet3/run_tdnn_lfr.sh | 1 + egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh | 130 ++++++++++++++ .../s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh | 161 +++++++++++++++++ .../s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh | 163 ++++++++++++++++++ .../s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh | 162 +++++++++++++++++ egs/wsj/s5/steps/nnet3/chain/build_tree.sh | 17 +- src/bin/convert-ali.cc | 7 + src/hmm/hmm-utils-test.cc | 8 +- src/hmm/hmm-utils.cc | 96 ++++++++++- src/hmm/hmm-utils.h | 10 ++ 10 files changed, 740 insertions(+), 15 deletions(-) create mode 120000 egs/swbd/s5c/local/nnet3/run_tdnn_lfr.sh create mode 100755 egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh create mode 100755 egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh create mode 100755 egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh create mode 100755 egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn_lfr.sh b/egs/swbd/s5c/local/nnet3/run_tdnn_lfr.sh new file mode 120000 index 00000000000..bff3b4164f7 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_tdnn_lfr.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lfr1c.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh new file mode 100755 index 00000000000..98cd8d5f34f --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# e is as c, but uses splicing similar to chain's without changing number of +# layers. + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# System tdnn_c tdnn_e +# WER on train_dev(tg) 17.37 16.75 +# WER on train_dev(fg) 15.94 15.34 +# WER on eval2000(tg) 20.0 19.5 +# WER on eval2000(fg) 18.2 18.0 +# Final train prob -1.43781 -1.40491 +# Final valid prob -1.56895 -1.55255 + + +stage=9 +affix= +train_stage=-10 +has_fisher=true +speed_perturb=true +common_egs_dir= +#exp/nnet3/tdnn_c_sp/egs +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +graph_dir=exp/tri4/graph_sw1_tg +if [ $stage -le 11 ]; then + for decode_set in train_dev eval2000; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_hires_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_hires_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh new file mode 100755 index 00000000000..a82b2078acb --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh @@ -0,0 +1,161 @@ +#!/bin/bash + +# _lfr1a is as _c, but is LFR (low frame rate): it uses triphone chain topology +# with a frame subsampling factor of 3. + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. 
+ +# System tdnn_c tdnn_lfr1a +# WER on train_dev(tg) 17.37 17.25 +# WER on train_dev(fg) 15.94 15.90 +# WER on eval2000(tg) 20.0 20.1 +# WER on eval2000(fg) 18.2 18.5 +# Final train prob -1.43781 -1.32434 +# Final valid prob -1.56895 -1.42206 + + +stage=11 +affix= +train_stage=-10 +has_fisher=true +speed_perturb=true +common_egs_dir= +reporting_email= +remove_egs=true +leftmost_questions_truncate=-1 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --repeat-frames true --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 8400 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,2) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-3,3) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-7,2) dim=1024 + relu-renorm-layer name=tdnn5 dim=1024 + + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $treedir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +echo 3 >$dir/frame_subsampling_factor +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir +fi + +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_hires_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_hires_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh new file mode 100755 index 00000000000..8c80dc3d7ad --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +# _lfr1b is as _lfr1a, but with one more -3,3 layer (the comparable +# non-LFR system is tdnn_d) + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# System tdnn_d tdnn_lfr1a tdnn_lfr1b +# WER on train_dev(tg) 16.72 17.25 17.00 +# WER on train_dev(fg) 15.31 15.90 15.57 +# WER on eval2000(tg) 19.2 20.1 19.3 +# WER on eval2000(fg) 17.8 18.5 17.8 +# Final train prob -1.22859 -1.32434 -1.11497 +# Final valid prob -1.354 -1.42206 -1.21105 + + + +stage=0 +affix= +train_stage=-10 +has_fisher=true +speed_perturb=true +common_egs_dir= +reporting_email= +remove_egs=true +leftmost_questions_truncate=-1 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --repeat-frames true --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 8400 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,2) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-3,3) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024 + relu-renorm-layer name=tdnn6 dim=1024 + + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $treedir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +echo 3 >$dir/frame_subsampling_factor +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir +fi + +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_hires_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_hires_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh new file mode 100755 index 00000000000..95cdbf7f975 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh @@ -0,0 +1,162 @@ +#!/bin/bash + +# _lfr1c is as _lfr1a, but uses splicing similar to chain's without changing +# number of layers (comparable non-LFR system is tdnn_e). + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# System tdnn_c tdnn_e tdnn_lfr1c +# WER on train_dev(tg) 17.37 16.75 17.10 +# WER on train_dev(fg) 15.94 15.34 15.74 +# WER on eval2000(tg) 20.0 19.5 19.2 +# WER on eval2000(fg) 18.2 18.0 17.7 +# Final train prob -1.43781 -1.40491 -1.29898 +# Final valid prob -1.56895 -1.55255 -1.43117 + + +stage=11 +affix= +train_stage=-10 +has_fisher=true +speed_perturb=true +common_egs_dir= +#exp/nnet3/tdnn_lfr1b_sp/egs +reporting_email= +remove_egs=true +leftmost_questions_truncate=-1 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --repeat-frames true --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 8400 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $treedir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +echo 3 >$dir/frame_subsampling_factor +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir +fi + +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_hires_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_hires_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index 72bc91c6014..280ab4ee0b2 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -27,6 +27,7 @@ leftmost_questions_truncate=-1 # note: this used to default to 10, but we never # we're changing the default tree_stats_opts= cluster_phones_opts= +repeat_frames=false # End configuration section. echo "$0 $@" # Print the command line for logging @@ -41,6 +42,15 @@ if [ $# != 5 ]; then echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --config # config containing options" echo " --stage # stage to do partial re-run from." + echo " --repeat-frames # Only affects alignment conversion at" + echo " # the end. If true, generate an " + echo " # alignment using the frame-subsampled " + echo " # topology that is repeated " + echo " # --frame-subsampling-factor times " + echo " # and interleaved, to be the same " + echo " # length as the original alignment " + echo " # (useful for cross-entropy training " + echo " # of reduced frame rate systems)." exit 1; fi @@ -173,9 +183,10 @@ if [ $stage -le -1 ]; then # for other purposes. echo "$0: Converting alignments from $alidir to use current tree" $cmd JOB=1:$nj $dir/log/convert.JOB.log \ - convert-ali --frame-subsampling-factor=$frame_subsampling_factor \ - $alidir/final.mdl $dir/1.mdl $dir/tree \ - "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + convert-ali --repeat-frames=$repeat_frames \ + --frame-subsampling-factor=$frame_subsampling_factor \ + $alidir/final.mdl $dir/1.mdl $dir/tree \ + "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; fi cp $dir/1.mdl $dir/final.mdl diff --git a/src/bin/convert-ali.cc b/src/bin/convert-ali.cc index 3a52b7904a0..89fe838638c 100644 --- a/src/bin/convert-ali.cc +++ b/src/bin/convert-ali.cc @@ -39,6 +39,7 @@ int main(int argc, char *argv[]) { int32 frame_subsampling_factor = 1; bool reorder = true; + bool repeat_frames = false; std::string phone_map_rxfilename; ParseOptions po(usage); @@ -48,6 +49,11 @@ int main(int argc, char *argv[]) { po.Register("reorder", &reorder, "True if you want the converted alignments to be 'reordered' " "versus the way they appear in the HmmTopology object"); + po.Register("repeat-frames", &repeat_frames, + "Only relevant when frame-subsampling-factor != 1. 
If true, " + "repeat frames of alignment by 'frame-subsampling-factor' " + "after alignment conversion, to keep the alignment the same " + "length as the input alignment."); po.Register("frame-subsampling-factor", &frame_subsampling_factor, "Can be used in converting alignments to reduced frame rates."); @@ -98,6 +104,7 @@ int main(int argc, char *argv[]) { new_ctx_dep, old_alignment, frame_subsampling_factor, + repeat_frames, reorder, (phone_map_rxfilename != "" ? &phone_map : NULL), &new_alignment)) { diff --git a/src/hmm/hmm-utils-test.cc b/src/hmm/hmm-utils-test.cc index 805b77ce7f0..69728cc8ca7 100644 --- a/src/hmm/hmm-utils-test.cc +++ b/src/hmm/hmm-utils-test.cc @@ -292,8 +292,8 @@ void TestConvertAlignment() { std::vector new_alignment; bool ans = ConvertAlignment(trans_model_old, trans_model_new, *ctx_dep_new, - old_alignment, subsample_factor, new_reorder, - NULL, &new_alignment); + old_alignment, subsample_factor, false, + new_reorder, NULL, &new_alignment); if(!ans) { KALDI_WARN << "Alignment conversion failed"; // make sure it failed for a good reason. @@ -311,8 +311,8 @@ void TestConvertAlignment() { // we should be able to convert back and it'll be the same. std::vector old_alignment_copy; bool ans = ConvertAlignment(trans_model_new, trans_model_old, *ctx_dep_old, - new_alignment, subsample_factor, old_reorder, - NULL, &old_alignment_copy); + new_alignment, subsample_factor, false, + old_reorder, NULL, &old_alignment_copy); KALDI_ASSERT(ans); KALDI_ASSERT(old_alignment_copy == old_alignment); } diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc index ab0b133f708..fe6c5b32d6e 100644 --- a/src/hmm/hmm-utils.cc +++ b/src/hmm/hmm-utils.cc @@ -772,18 +772,38 @@ static inline void ConvertAlignmentForPhone( } + /** - This function, called from ConvertAlignment(), works out suitable new lengths - of phones in the case where subsample_factor != 1. The input vectors + This function, called from ConvertAlignmentInternal(), works out suitable new + lengths of phones in the case where subsample_factor != 1. The input vectors 'mapped_phones' and 'old_lengths' must be the same size-- the length of the phone sequence. The 'topology' object and 'mapped_phones' are needed to work out the minimum length of each phone in the sequence. - Returns true only if it could not assign lengths (because the topology was + Returns false only if it could not assign lengths (because the topology was too long relative to the number of frames). + + @param topology [in] The new phone lengths are computed with + regard to this topology + @param mapped_phones [in] The phones for which this function computes + new lengths + @param old_lengths [in] The old lengths + @param conversion_shift [in] This will normally equal subsample_factor - 1 + but may be less than that if the 'repeat_frames' + option is true; it's used for generating + 'frame-shifted' versions of alignments that + we will later interpolate. This helps us keep + the phone boundaries of the subsampled and + interpolated alignments the same as + the original alignment. + @param subsample_factor [in] The frame subsampling factor... normally 1, but + might be > 1 if we're converting to a + reduced-frame-rate system. + @param new_lengths [out] The vector for storing new lengths. 
*/ static bool ComputeNewPhoneLengths(const HmmTopology &topology, const std::vector &mapped_phones, const std::vector &old_lengths, + int32 conversion_shift, int32 subsample_factor, std::vector *new_lengths) { int32 phone_sequence_length = old_lengths.size(); @@ -797,10 +817,10 @@ static bool ComputeNewPhoneLengths(const HmmTopology &topology, // the subsampled alignments have the same length as features // subsampled with 'subsample-feats'. int32 subsampled_time = - (cur_time_elapsed + subsample_factor - 1) / subsample_factor; + (cur_time_elapsed + conversion_shift) / subsample_factor; cur_time_elapsed += old_lengths[i]; int32 next_subsampled_time = - (cur_time_elapsed + subsample_factor - 1) / subsample_factor; + (cur_time_elapsed + conversion_shift) / subsample_factor; (*new_lengths)[i] = next_subsampled_time - subsampled_time; } bool changed = true; @@ -850,14 +870,23 @@ static bool ComputeNewPhoneLengths(const HmmTopology &topology, return true; } -bool ConvertAlignment(const TransitionModel &old_trans_model, +/** + This function is the same as 'ConvertAligment', + but instead of the 'repeat_frames' option it supports the 'conversion_shift' + option; see the documentation of ComputeNewPhoneLengths() for what + 'conversion_shift' is for. +*/ + +static bool ConvertAlignmentInternal(const TransitionModel &old_trans_model, const TransitionModel &new_trans_model, const ContextDependencyInterface &new_ctx_dep, const std::vector &old_alignment, + int32 conversion_shift, int32 subsample_factor, bool new_is_reordered, const std::vector *phone_map, std::vector *new_alignment) { + KALDI_ASSERT(0 <= conversion_shift && conversion_shift < subsample_factor); bool old_is_reordered = IsReordered(old_trans_model, old_alignment); KALDI_ASSERT(new_alignment != NULL); new_alignment->clear(); @@ -893,7 +922,7 @@ bool ConvertAlignment(const TransitionModel &old_trans_model, for (int32 i = 0; i < phone_sequence_length; i++) old_lengths[i] = old_split[i].size(); if (!ComputeNewPhoneLengths(new_trans_model.GetTopo(), - mapped_phones, old_lengths, + mapped_phones, old_lengths, conversion_shift, subsample_factor, &new_lengths)) { KALDI_WARN << "Failed to produce suitable phone lengths"; return false; @@ -931,7 +960,58 @@ bool ConvertAlignment(const TransitionModel &old_trans_model, } } KALDI_ASSERT(new_alignment->size() == - (old_alignment.size() + subsample_factor - 1)/subsample_factor); + (old_alignment.size() + conversion_shift)/subsample_factor); + return true; +} + +bool ConvertAlignment(const TransitionModel &old_trans_model, + const TransitionModel &new_trans_model, + const ContextDependencyInterface &new_ctx_dep, + const std::vector &old_alignment, + int32 subsample_factor, + bool repeat_frames, + bool new_is_reordered, + const std::vector *phone_map, + std::vector *new_alignment) { + if (!repeat_frames || subsample_factor == 1) { + return ConvertAlignmentInternal(old_trans_model, + new_trans_model, + new_ctx_dep, + old_alignment, + subsample_factor - 1, + subsample_factor, + new_is_reordered, + phone_map, + new_alignment); + // The value "subsample_factor - 1" for conversion_shift above ensures the + // alignments have the same length as the output of 'subsample-feats' + } else { + std::vector > shifted_alignments(subsample_factor); + for (int32 conversion_shift = subsample_factor - 1; + conversion_shift >= 0; conversion_shift--) { + if (!ConvertAlignmentInternal(old_trans_model, + new_trans_model, + new_ctx_dep, + old_alignment, + conversion_shift, + subsample_factor, + new_is_reordered, + phone_map, 
+ &shifted_alignments[conversion_shift])) + return false; + } + KALDI_ASSERT(new_alignment != NULL); + new_alignment->clear(); + new_alignment->reserve(old_alignment.size()); + int32 max_shifted_ali_length = (old_alignment.size() / subsample_factor) + + (old_alignment.size() % subsample_factor); + for (int32 i = 0; i < max_shifted_ali_length; i++) + for (int32 conversion_shift = subsample_factor - 1; + conversion_shift >= 0; conversion_shift--) + if (i < static_cast(shifted_alignments[conversion_shift].size())) + new_alignment->push_back(shifted_alignments[conversion_shift][i]); + } + KALDI_ASSERT(new_alignment->size() == old_alignment.size()); return true; } diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h index 1af62c646be..3d51cbe1f14 100644 --- a/src/hmm/hmm-utils.h +++ b/src/hmm/hmm-utils.h @@ -245,6 +245,15 @@ bool SplitToPhones(const TransitionModel &trans_model, @param subsample_factor [in] The frame subsampling factor... normally 1, but might be > 1 if we're converting to a reduced-frame-rate system. + @param repeat_frames [in] Only relevant when subsample_factor != 1 + If true, repeat frames of alignment by + 'subsample_factor' after alignment + conversion, to keep the alignment the same + length as the input alignment. + [note: we actually do this by interpolating + 'subsample_factor' separately generated + alignments, to keep the phone boundaries + the same as the input where possible.] @param reorder [in] True if you want the pdf-ids on the new alignment to be 'reordered'. (vs. the way they appear in the HmmTopology object) @@ -257,6 +266,7 @@ bool ConvertAlignment(const TransitionModel &old_trans_model, const ContextDependencyInterface &new_ctx_dep, const std::vector &old_alignment, int32 subsample_factor, // 1 in the normal case -> no subsampling. + bool repeat_frames, bool reorder, const std::vector *phone_map, // may be NULL std::vector *new_alignment); From dda277d8bcb038949a826cd2a395b444bb26ba28 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 21 Feb 2017 13:44:10 -0500 Subject: [PATCH 163/213] [build] cloning IRSTLM, use https to access github, not ssh (#1441) --- tools/extras/install_irstlm.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/extras/install_irstlm.sh b/tools/extras/install_irstlm.sh index 91635a30bbc..c6cc9adf568 100755 --- a/tools/extras/install_irstlm.sh +++ b/tools/extras/install_irstlm.sh @@ -12,13 +12,13 @@ errcho() { echo "$@" 1>&2; } errcho "****() Installing IRSTLM" if [ ! -x ./irstlm ] ; then - svn=`which svn` + svn=`which git` if [ $? != 0 ] ; then - errcho "****() You need to have svn (subversion) installed" + errcho "****() You need to have git installed" exit 1 fi ( - git clone git@github.com:irstlm-team/irstlm.git irstlm + git clone https://github.com/irstlm-team/irstlm.git irstlm ) || { errcho "****() Error getting the IRSTLM sources. The server hosting it" errcho "****() might be down." @@ -43,6 +43,7 @@ fi ) || { errcho "***() Error compiling IRSTLM. The error messages could help you " errcho "***() in figuring what went wrong." + exit 1 } ( From 1b5563c78effc1e93cb40223a68d63dec1d39778 Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Tue, 21 Feb 2017 23:53:02 -0800 Subject: [PATCH 164/213] [build] Update Travis configuration to get builds to complete on time. (#1443) Run the build in a container-based Ubuntu 12.04 virtual environment. Compile with clang-3.8. Add Travis scripts to Travis testable files. 
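The effect of widening the set of testable files can be previewed locally with something like the following (RANGE is a stand-in for what Travis exports as TRAVIS_COMMIT_RANGE):

    # List the changed files that would now trigger a full build; if this
    # prints nothing, the Travis script reports success without running tests.
    RANGE=origin/master...HEAD
    git diff --name-only "$RANGE" -- src/ .travis.yml tools/extras/travis_*.sh
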
--- .travis.yml | 13 +++++++------ tools/extras/travis_script.sh | 5 +++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 54ac9f11c9f..f8e2bac0362 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,8 +2,6 @@ notifications: email: false language: cpp -sudo: false -dist: trusty os: - linux @@ -11,12 +9,15 @@ os: addons: apt: sources: - - llvm-toolchain-trusty-3.9 + - ubuntu-toolchain-r-test + - llvm-toolchain-precise-3.8 packages: - gdb - - gfortran + - gcc-4.9 + - g++-4.9 + - gfortran-4.9 - liblapack-dev - - clang-3.9 + - clang-3.8 branches: only: @@ -29,7 +30,7 @@ before_install: - tools/extras/travis_install_bindeps.sh $XROOT script: - - CXX=clang++-3.9 + - CXX=clang++-3.8 CFLAGS="-march=native" LDFLAGS="-llapack" INCDIRS="$XROOT/usr/include" diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 8aea788d9bc..d1b9049ef22 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -4,7 +4,7 @@ # Typical usage shown below; any one can be safely left unset. # INCDIRS="~/xroot/usr/include" # LIBDIRS="~/xroot/usr/lib /usr/lib/openblas-base" -# CXX=clang++-3.9 +# CXX=clang++-3.8 # CFLAGS="-march=native -O2" # LDFLAGS="-llapack" @@ -38,7 +38,8 @@ runvx env # However, do run tests if TRAVIS_COMMIT_RANGE does not parse. This # most likely means the branch was reset by --force; re-run tests then. if git rev-parse "${TRAVIS_COMMIT_RANGE}" >/dev/null 2>&1 && \ - ! git diff --name-only "${TRAVIS_COMMIT_RANGE}" -- ${TESTABLE_DIRS} | read REPLY + ! git diff --name-only "${TRAVIS_COMMIT_RANGE}" -- ${TESTABLE_DIRS} \ + .travis.yml tools/extras/travis_*.sh | read REPLY then echo; echo "No changes outside ${TESTABLE_DIRS} in the commit" \ "range ${TRAVIS_COMMIT_RANGE}; reporting success." 
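The underlying portability issue: BSD userland, as shipped on macOS, requires a file operand for paste, so "-" has to be given explicitly to read standard input, and the shuffle is done with utils/shuffle_list.pl rather than relying on sort -R. A minimal illustration that works with both GNU and BSD tools:

    # Portable stdin spelling for "paste -s"; BSD paste needs the explicit "-".
    printf 'a\nb\nc\n' | paste -s -
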
From fe4098c425844c8b83898d1f8dd36a1e2ed55b5f Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Wed, 22 Feb 2017 13:13:40 -0500 Subject: [PATCH 165/213] [egs] Iban recipe: MacOSX compatibility fixes (#1448) --- egs/iban/s5/local/prepare_lm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/iban/s5/local/prepare_lm.sh b/egs/iban/s5/local/prepare_lm.sh index a19dc18f566..10d5e276aa3 100755 --- a/egs/iban/s5/local/prepare_lm.sh +++ b/egs/iban/s5/local/prepare_lm.sh @@ -10,7 +10,7 @@ set -e -o pipefail local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm -nl -nrz -w10 corpus/LM/iban-bp-2012.txt | sort -R > data/local/external_text +nl -nrz -w10 corpus/LM/iban-bp-2012.txt | utils/shuffle_list.pl > data/local/external_text local/train_lms_srilm.sh --train-text data/local/external_text data/ data/srilm_external # let's do ngram interpolation of the previous two LMs @@ -21,7 +21,7 @@ for w in 0.9 0.8 0.7 0.6 0.5; do ngram -lm data/srilm/lm.gz -mix-lm data/srilm_external/lm.gz \ -lambda $w -write-lm data/srilm_interp/lm.${w}.gz echo -n "data/srilm_interp/lm.${w}.gz " - ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s + ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s - done | sort -k15,15g > data/srilm_interp/perplexities.txt # for basic decoding, let's use only a trigram LM From f2b29c9b0a156dab1e06c3635d5c95e94630f7af Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 23 Feb 2017 00:51:08 -0500 Subject: [PATCH 166/213] [scripts] Make it so i-vector ID is not required for steps/nnet3/decode.sh --- egs/wsj/s5/steps/nnet3/decode.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 8aa86e92dcb..35a02001ae7 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -68,7 +68,7 @@ model=$srcdir/$iter.mdl extra_files= if [ ! -z "$online_ivector_dir" ]; then steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 - extra_files="$srcdir/final.ie.id $online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" fi for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do From ca3d53c4c305886180340ac8028eeb3f416a786a Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Thu, 23 Feb 2017 13:27:45 -0500 Subject: [PATCH 167/213] [build] Stop env.sh from crashing when set -u is active (#1451) --- .gitignore | 1 + tools/extras/install_irstlm.sh | 4 ++-- tools/extras/install_liblbfgs.sh | 6 +++--- tools/extras/install_mpg123.sh | 4 ++-- tools/extras/install_sequitur.sh | 6 +++--- tools/extras/install_srilm.sh | 4 ++-- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 02a88acb621..fd1f73af215 100644 --- a/.gitignore +++ b/.gitignore @@ -120,6 +120,7 @@ GSYMS /tools/pthreads /tools/pthreads*.zip /tools/sequitur +/tools/sequitur-g2p /tools/srilm.tgz /tools/liblbfgs-1.10.tar.gz /tools/liblbfgs-1.10/ diff --git a/tools/extras/install_irstlm.sh b/tools/extras/install_irstlm.sh index c6cc9adf568..8b0f8b6519e 100755 --- a/tools/extras/install_irstlm.sh +++ b/tools/extras/install_irstlm.sh @@ -47,12 +47,12 @@ fi } ( - [ ! -z ${IRSTLM} ] && \ + [ ! -z "${IRSTLM}" ] && \ echo >&2 "IRSTLM variable is aleady defined. Undefining..." && \ unset IRSTLM [ -f ./env.sh ] && . ./env.sh - [ ! -z ${IRSTLM} ] && \ + [ ! 
-z "${IRSTLM}" ] && \ echo >&2 "IRSTLM config is already in env.sh" && exit wd=`pwd -P` diff --git a/tools/extras/install_liblbfgs.sh b/tools/extras/install_liblbfgs.sh index 7e6589b160d..10f72cad84f 100644 --- a/tools/extras/install_liblbfgs.sh +++ b/tools/extras/install_liblbfgs.sh @@ -14,19 +14,19 @@ make -i install cd .. ( - [ ! -z ${LIBLBFGS} ] && \ + [ ! -z "${LIBLBFGS}" ] && \ echo >&2 "LIBLBFGS variable is aleady defined. Undefining..." && \ unset LIBLBFGS [ -f ./env.sh ] && . ./env.sh - [ ! -z ${LIBLBFGS} ] && \ + [ ! -z "${LIBLBFGS}" ] && \ echo >&2 "libLBFGS config is already in env.sh" && exit wd=`pwd` wd=`readlink -f $wd || pwd` echo "export LIBLBFGS=$wd/liblbfgs-1.10" - echo export LD_LIBRARY_PATH='${LD_LIBRARY_PATH}':'${LIBLBFGS}'/lib/.libs + echo export LD_LIBRARY_PATH='${LD_LIBRARY_PATH:-}':'${LIBLBFGS}'/lib/.libs ) >> env.sh diff --git a/tools/extras/install_mpg123.sh b/tools/extras/install_mpg123.sh index 870275c6a10..5702ff476b4 100755 --- a/tools/extras/install_mpg123.sh +++ b/tools/extras/install_mpg123.sh @@ -55,13 +55,13 @@ ln -s mpg123-1.21.0 mpg123 ( set +u - [ ! -z ${MPG123} ] && \ + [ ! -z "${MPG123}" ] && \ echo >&2 "MPG123 variable is aleady defined. Undefining..." && \ unset MPG123 [ -f ./env.sh ] && . ./env.sh - [ ! -z ${MPG123} ] && \ + [ ! -z "${MPG123}" ] && \ echo >&2 "MPG123 config is already in env.sh" && exit wd=`pwd` diff --git a/tools/extras/install_sequitur.sh b/tools/extras/install_sequitur.sh index 50ec7e98b5e..ba6d028edad 100755 --- a/tools/extras/install_sequitur.sh +++ b/tools/extras/install_sequitur.sh @@ -76,13 +76,13 @@ cd ../ ( set +u - [ ! -z ${SEQUITUR} ] && \ + [ ! -z "${SEQUITUR}" ] && \ echo >&2 "SEQUITUR variable is aleady defined. Undefining..." && \ unset SEQUITUR [ -f ./env.sh ] && . ./env.sh - [ ! -z ${SEQUITUR} ] && \ + [ ! -z "${SEQUITUR}" ] && \ echo >&2 "SEQUITUR config is already in env.sh" && exit wd=`pwd` @@ -91,7 +91,7 @@ cd ../ echo "export SEQUITUR=$wd/sequitur-g2p" echo "export PATH=\$PATH:\${SEQUITUR}/bin" echo "_site_packages=\`find \${SEQUITUR}/lib -type d -regex '.*python.*/site-packages'\`" - echo "export PYTHONPATH=\$PYTHONPATH:\$_site_packages" + echo "export PYTHONPATH=\${PYTHONPATH:-}:\$_site_packages" ) >> env.sh echo >&2 "Installation of SEQUITUR finished successfully" diff --git a/tools/extras/install_srilm.sh b/tools/extras/install_srilm.sh index 5d709e8a38b..000b1dbe6c5 100755 --- a/tools/extras/install_srilm.sh +++ b/tools/extras/install_srilm.sh @@ -61,13 +61,13 @@ make || exit 1 cd .. ( - [ ! -z ${SRILM} ] && \ + [ ! -z "${SRILM}" ] && \ echo >&2 "SRILM variable is aleady defined. Undefining..." && \ unset SRILM [ -f ./env.sh ] && . ./env.sh - [ ! -z ${SRILM} ] && \ + [ ! 
-z "${SRILM}" ] && \ echo >&2 "SRILM config is already in env.sh" && exit wd=`pwd` From 6ed39968f2098b5a673c543305a7a786671f952e Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Fri, 24 Feb 2017 12:35:03 -0500 Subject: [PATCH 168/213] upgrade the sox calls to use more compatible options (#1453) --- egs/ami/s5/local/ami_ihm_data_prep.sh | 2 +- egs/ami/s5/local/ami_ihm_scoring_data_prep.sh | 2 +- egs/ami/s5/local/ami_mdm_data_prep.sh | 2 +- egs/ami/s5/local/ami_mdm_scoring_data_prep.sh | 2 +- egs/ami/s5/local/ami_sdm_data_prep.sh | 2 +- egs/ami/s5/local/ami_sdm_scoring_data_prep.sh | 2 +- egs/ami/s5b/local/ami_ihm_data_prep.sh | 2 +- egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh | 2 +- egs/ami/s5b/local/ami_mdm_data_prep.sh | 2 +- egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh | 2 +- egs/ami/s5b/local/ami_sdm_data_prep.sh | 2 +- egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/egs/ami/s5/local/ami_ihm_data_prep.sh b/egs/ami/s5/local/ami_ihm_data_prep.sh index 3a1d43d1ea1..b3ec1723713 100755 --- a/egs/ami/s5/local/ami_ihm_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_data_prep.sh @@ -69,7 +69,7 @@ sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp > $dir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # (1d) reco2file_and_channel cat $dir/wav.scp \ diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh index c3b9914d7a0..b69732a61eb 100755 --- a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -68,7 +68,7 @@ sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp > $dir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # (1d) reco2file_and_channel cat $dir/wav.scp \ diff --git a/egs/ami/s5/local/ami_mdm_data_prep.sh b/egs/ami/s5/local/ami_mdm_data_prep.sh index bc7e4180b4a..2cc973cb2d5 100755 --- a/egs/ami/s5/local/ami_mdm_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_data_prep.sh @@ -75,7 +75,7 @@ awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp #prep reco2file_and_channel cat $dir/wav.scp | \ diff --git a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh index ab0fd185f70..8d9e24a9838 100755 --- a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh @@ -67,7 +67,7 @@ sed -e 's?.*/??' -e 's?.wav??' 
$tmpdir/wav.flist | \ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp #prep reco2file_and_channel cat $tmpdir/wav.scp | \ diff --git a/egs/ami/s5/local/ami_sdm_data_prep.sh b/egs/ami/s5/local/ami_sdm_data_prep.sh index 8eda00f1d15..e662759a610 100755 --- a/egs/ami/s5/local/ami_sdm_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_data_prep.sh @@ -74,7 +74,7 @@ awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text #replace path with an appropriate sox command that select a single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # this file reco2file_and_channel maps recording-id cat $dir/wav.scp | \ diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh index 01173d2e3a6..3fa7c938479 100755 --- a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -72,7 +72,7 @@ sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp #prep reco2file_and_channel cat $tmpdir/wav.scp | \ diff --git a/egs/ami/s5b/local/ami_ihm_data_prep.sh b/egs/ami/s5b/local/ami_ihm_data_prep.sh index 38f14023b16..8ffa1f1e9c5 100755 --- a/egs/ami/s5b/local/ami_ihm_data_prep.sh +++ b/egs/ami/s5b/local/ami_ihm_data_prep.sh @@ -75,7 +75,7 @@ sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp > $dir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # (1d) reco2file_and_channel cat $dir/wav.scp \ diff --git a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh index 3ae42afb3d8..746c42c4c1a 100755 --- a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh @@ -74,7 +74,7 @@ sed -e 's?.*/??' -e 's?.wav??' 
$dir/wav.flist | \ awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp > $dir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # (1d) reco2file_and_channel cat $dir/wav.scp \ diff --git a/egs/ami/s5b/local/ami_mdm_data_prep.sh b/egs/ami/s5b/local/ami_mdm_data_prep.sh index 0ab11c5893b..d100347a356 100755 --- a/egs/ami/s5b/local/ami_mdm_data_prep.sh +++ b/egs/ami/s5b/local/ami_mdm_data_prep.sh @@ -79,7 +79,7 @@ awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp #prep reco2file_and_channel cat $dir/wav.scp | \ diff --git a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh index 4fbfe12ccad..65f514f223c 100755 --- a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh @@ -72,7 +72,7 @@ sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp #prep reco2file_and_channel cat $tmpdir/wav.scp | \ diff --git a/egs/ami/s5b/local/ami_sdm_data_prep.sh b/egs/ami/s5b/local/ami_sdm_data_prep.sh index 267aef75535..327595070a6 100755 --- a/egs/ami/s5b/local/ami_sdm_data_prep.sh +++ b/egs/ami/s5b/local/ami_sdm_data_prep.sh @@ -86,7 +86,7 @@ awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text #replace path with an appropriate sox command that select a single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # this file reco2file_and_channel maps recording-id cat $dir/wav.scp | \ diff --git a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh index d0609e552cd..1378f8b8965 100755 --- a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh @@ -82,7 +82,7 @@ sed -e 's?.*/??' -e 's?.wav??' 
$tmpdir/wav.flist | \ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp #prep reco2file_and_channel cat $tmpdir/wav.scp | \ From efe107473b1a3f30a7aa963420d0f24f51fbd879 Mon Sep 17 00:00:00 2001 From: meixu song Date: Sat, 25 Feb 2017 13:39:15 +0800 Subject: [PATCH 169/213] [egs] fix typo in egs/swbd/s5c/local/nnet3/run_ivector_common.sh (#1452) --- egs/swbd/s5c/local/nnet3/run_ivector_common.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh index 9768d82c806..b64d3e468df 100755 --- a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh +++ b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh @@ -62,7 +62,7 @@ if [ $stage -le 3 ]; then for dataset in $train_set train_100k_nodup; do utils/copy_data_dir.sh data/$dataset data/${dataset}_hires - utils/data/perturb_data_dir_volume.sh adata/${dataset}_hires + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; From 0f839a5f88d41a90f01eabdef2706d2966030675 Mon Sep 17 00:00:00 2001 From: meixu song Date: Sat, 25 Feb 2017 13:40:28 +0800 Subject: [PATCH 170/213] [scripts] xconfig: make scripts work when LDA-like preconditioning layer is not used (#1447) --- egs/wsj/s5/steps/libs/nnet3/train/common.py | 20 +++++++++++++------ .../steps/libs/nnet3/xconfig/basic_layers.py | 15 +++++++------- egs/wsj/s5/steps/nnet3/train_dnn.py | 4 ++-- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 17 ++++++++++++++++ 4 files changed, 41 insertions(+), 15 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 6d212bc5d49..54d4b0c3faa 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -440,12 +440,20 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, def prepare_initial_network(dir, run_opts, srand=-3): - common_lib.run_job( - """{command} {dir}/log/add_first_layer.log \ - nnet3-init --srand={srand} {dir}/init.raw \ - {dir}/configs/layer1.config {dir}/0.raw""".format( - command=run_opts.command, srand=srand, - dir=dir)) + if os.path.exists(dir+"/configs/init.config"): + common_lib.run_job( + """{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand={srand} {dir}/init.raw \ + {dir}/configs/layer1.config {dir}/0.raw""".format( + command=run_opts.command, srand=srand, + dir=dir)) + else: + common_lib.run_job( + """{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand={srand} \ + {dir}/configs/layer1.config {dir}/0.raw""".format( + command=run_opts.command, srand=srand, + dir=dir)) def verify_iterations(num_iters, num_epochs, num_hidden_layers, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 1a42c86ad81..de4c4af9df8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -786,7 +786,8 @@ def set_default_configs(self): # the most recent layer. 
self.config = { 'input':'[-1]', 'dim':-1, - 'affine-transform-file':''} + 'affine-transform-file':'', + 'write-init-config':True} def check_configs(self): if self.config['affine-transform-file'] is None: @@ -817,12 +818,12 @@ def get_full_config(self): output_dim = self.output_dim() transform_file = self.config['affine-transform-file'] - - # to init.config we write an output-node with the name 'output' and - # with a Descriptor equal to the descriptor that's the input to this - # layer. This will be used to accumulate stats to learn the LDA transform. - line = 'output-node name=output input={0}'.format(descriptor_final_string) - ans.append(('init', line)) + if self.config['write-init-config']: + # to init.config we write an output-node with the name 'output' and + # with a Descriptor equal to the descriptor that's the input to this + # layer. This will be used to accumulate stats to learn the LDA transform. + line = 'output-node name=output input={0}'.format(descriptor_final_string) + ans.append(('init', line)) # write the 'real' component to final.config line = 'component name={0} type=FixedAffineComponent matrix={1}'.format( diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index f8a4cb6c861..7f52d9f8f26 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -199,7 +199,7 @@ def train(args, run_opts, background_process_handler): # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. - if (args.stage <= -5): + if (args.stage <= -5) and os.path.exists(args.dir+"/configs/init.config"): logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.run_job( @@ -245,7 +245,7 @@ def train(args, run_opts, background_process_handler): # use during decoding common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if (args.stage <= -3): + if (args.stage <= -3) and os.path.exists(args.dir+"/configs/init.config"): logger.info('Computing the preconditioning matrix for input features') train_lib.common.compute_preconditioning_matrix( diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index d3abb82c92c..5184b6eed41 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -170,7 +170,24 @@ def write_config_files(config_dir, all_layers): # preserves the backtrace raise + # remove previous init.config + try: + os.remove(config_dir + '/init.config') + except OSError: + pass + for basename, lines in config_basename_to_lines.items(): + # check the lines num start with 'output-node': + num_output_node_lines = sum( [ 1 if line.startswith('output-node' ) else 0 + for line in lines ] ) + if num_output_node_lines == 0: + if basename == 'init': + continue # do not write the init.config + else: + print('{0}: error in xconfig file {1}: may be lack of a output layer'.format( + sys.argv[0], sys.argv[2]), file=sys.stderr) + raise + header = config_basename_to_header[basename] filename = '{0}/{1}.config'.format(config_dir, basename) try: From d6f6892c027a03f9ddefb86de234ae7daa4cbedf Mon Sep 17 00:00:00 2001 From: Dogan Can Date: Sat, 25 Feb 2017 12:09:24 -0800 Subject: [PATCH 171/213] [build] Update OpenFst minimum version check to 1.6 in tools/Makefile. 
(#1455) --- tools/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index f40a75da5f8..b3d5a6c53b9 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -10,9 +10,9 @@ CC = gcc # used for sph2pipe OPENFST_VERSION = 1.6.1 OPENFST_VER_NUM := $(shell echo $(OPENFST_VERSION) | sed 's/\./ /g' | xargs printf "%d%02d%02d") -ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10503)","1") +ifeq ("$(shell expr $(OPENFST_VER_NUM) \< 10600)","1") $(error OpenFst-$(OPENFST_VERSION) is not supported. \ - Supported versions: >= 1.5.3) + Supported versions: >= 1.6.0) endif all: check_required_programs sph2pipe atlas sclite openfst From 5ddf62a4efb47aebaaa3c4b110f901a8474869ce Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 25 Feb 2017 17:19:33 -0500 Subject: [PATCH 172/213] [src] Fix bug in decodable-online-looped.cc (prevent crash in nnet3 online decoding). --- src/nnet3/decodable-online-looped.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnet3/decodable-online-looped.cc b/src/nnet3/decodable-online-looped.cc index 77be1f166bf..f231a2d5b62 100644 --- a/src/nnet3/decodable-online-looped.cc +++ b/src/nnet3/decodable-online-looped.cc @@ -129,7 +129,7 @@ void DecodableNnetLoopedOnlineBase::AdvanceChunk() { int32 num_feature_frames_ready = input_features_->NumFramesReady(); bool is_finished = input_features_->IsLastFrame(num_feature_frames_ready - 1); - if (end_input_frame >= num_feature_frames_ready && !is_finished) { + if (end_input_frame > num_feature_frames_ready && !is_finished) { // we shouldn't be attempting to read past the end of the available features // until we have reached the end of the input (i.e. the end-user called // InputFinished(), announcing that there is no more waveform; at this point From 21abc6ff01a3a13959596bd14f76f61974d240d3 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Sat, 25 Feb 2017 21:02:35 -0500 Subject: [PATCH 173/213] [egs] fix MacOSX incompatibilities in calls of paste (#1457) --- egs/iban/s5/local/train_lms_srilm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/iban/s5/local/train_lms_srilm.sh b/egs/iban/s5/local/train_lms_srilm.sh index 9ed88842650..f72596e750a 100755 --- a/egs/iban/s5/local/train_lms_srilm.sh +++ b/egs/iban/s5/local/train_lms_srilm.sh @@ -206,9 +206,9 @@ echo "--------------------" echo "Computing perplexity" echo "--------------------" ( - for f in $tgtdir/2gram* ; do ( echo $f; ngram -order 2 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done - for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done - for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/2gram* ; do ( echo $f; ngram -order 2 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done ) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt echo "The perlexity scores report is stored in $tgtdir/perplexities.txt " From dade1b0a20f89b8ae7b6a978c1b3bebc6207efe2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 27 Feb 2017 02:11:21 
-0500 Subject: [PATCH 174/213] [egs,scripts] Misc script fixes; refactor wsj/s5 examples; update tedlium/s5_r2 (#1456) --- egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh | 198 ------ .../local/chain/tuning/run_tdnn_lstm_1e.sh | 4 +- egs/tedlium/s5_r2/local/nnet3/compare_wer.sh | 25 +- .../s5_r2/local/nnet3/run_ivector_common.sh | 4 +- egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh | 1 + .../s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh | 1 + .../s5_r2/local/nnet3/tuning/run_tdnn_1b.sh | 3 + .../s5_r2/local/nnet3/tuning/run_tdnn_1c.sh | 186 ++++++ .../local/nnet3/tuning/run_tdnn_lfr_1a.sh | 200 ++++++ .../local/nnet3/tuning/run_tdnn_lstm_1a.sh | 31 +- .../local/nnet3/tuning/run_tdnn_lstm_1c.sh | 2 + .../nnet3/tuning/run_tdnn_lstm_lfr_1a.sh | 310 +++++++++ egs/wsj/s5/RESULTS | 176 +++-- egs/wsj/s5/local/chain/compare_wer.sh | 137 ++++ egs/wsj/s5/local/chain/run_tdnn.sh | 1 + egs/wsj/s5/local/chain/run_tdnn_lstm.sh | 1 + egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh | 361 ++++++++++ .../s5/local/chain/tuning/run_tdnn_lstm_1a.sh | 393 +++++++++++ egs/wsj/s5/local/nnet3/compare_wer.sh | 139 ++++ egs/wsj/s5/local/nnet3/run_ivector_common.sh | 228 +++++-- egs/wsj/s5/local/nnet3/run_lstm.sh | 3 +- .../s5/local/nnet3/run_lstm_discriminative.sh | 3 + egs/wsj/s5/local/nnet3/run_tdnn.sh | 75 +-- egs/wsj/s5/local/nnet3/run_tdnn_baseline.sh | 79 --- egs/wsj/s5/local/nnet3/run_tdnn_lstm.sh | 1 + egs/wsj/s5/local/nnet3/run_tdnn_lstm_lfr.sh | 1 + egs/wsj/s5/local/nnet3/tuning/run_tdnn_1a.sh | 162 +++++ egs/wsj/s5/local/nnet3/tuning/run_tdnn_1b.sh | 168 +++++ .../s5/local/nnet3/tuning/run_tdnn_lstm_1a.sh | 288 ++++++++ .../nnet3/tuning/run_tdnn_lstm_lfr_1a.sh | 335 ++++++++++ egs/wsj/s5/local/run_basis_fmllr.sh | 6 +- egs/wsj/s5/local/run_mmi_tri2b.sh | 73 --- egs/wsj/s5/run.sh | 616 +++++++++--------- .../s5/steps/libs/nnet3/report/log_parse.py | 14 +- egs/wsj/s5/steps/lmrescore.sh | 2 +- egs/wsj/s5/steps/mixup.sh | 153 ----- egs/wsj/s5/utils/fix_data_dir.sh | 9 +- egs/wsj/s5/utils/mkgraph.sh | 2 +- egs/wsj/s5/utils/validate_data_dir.sh | 2 +- 39 files changed, 3310 insertions(+), 1083 deletions(-) delete mode 100755 egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh create mode 120000 egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh create mode 120000 egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh create mode 100755 egs/wsj/s5/local/chain/compare_wer.sh create mode 120000 egs/wsj/s5/local/chain/run_tdnn.sh create mode 120000 egs/wsj/s5/local/chain/run_tdnn_lstm.sh create mode 100755 egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh create mode 100755 egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh create mode 100755 egs/wsj/s5/local/nnet3/compare_wer.sh mode change 100755 => 120000 egs/wsj/s5/local/nnet3/run_tdnn.sh delete mode 100755 egs/wsj/s5/local/nnet3/run_tdnn_baseline.sh create mode 120000 egs/wsj/s5/local/nnet3/run_tdnn_lstm.sh create mode 120000 egs/wsj/s5/local/nnet3/run_tdnn_lstm_lfr.sh create mode 100755 egs/wsj/s5/local/nnet3/tuning/run_tdnn_1a.sh create mode 100755 egs/wsj/s5/local/nnet3/tuning/run_tdnn_1b.sh create mode 100755 egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_1a.sh create mode 100755 egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh delete mode 100755 egs/wsj/s5/local/run_mmi_tri2b.sh delete mode 100755 egs/wsj/s5/steps/mixup.sh diff --git 
a/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh deleted file mode 100755 index 9e795316352..00000000000 --- a/egs/tedlium/s5_r2/local/chain/run_tdnn_d.sh +++ /dev/null @@ -1,198 +0,0 @@ -#!/bin/bash - -# by default, with cleanup: -# local/chain/run_tdnn.sh - -# without cleanup: -# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run the corresponding non-chain nnet3 system -# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs"; - - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir data/${train_set}_sp_hires_comb \ - --ivector-dir $train_ivector_dir \ - --tree-dir $tree_dir \ - --relu-dim 550 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize 0.1 \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 1.0 \ - $dir/configs || exit 1; -fi - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh index 6704f9d299e..e56946c1b54 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -259,14 +259,14 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage fi steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ + --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ diff --git a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh index 3e14a4efc55..da0bb728e69 100755 --- a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh +++ b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh @@ -1,12 +1,20 @@ #!/bin/bash # this script is used for comparing decoding results between systems. -# e.g. 
local/nnet3/compare_wer_general.sh exp/nnet3_cleaned/tdnn_{c,d}_sp +# e.g. local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_{c,d}_sp # For use with discriminatively trained systems you specify the epochs after a colon: # for instance, # local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_c_sp exp/nnet3_cleaned/tdnn_c_sp_smbr:{1,2,3} +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/nnet3_cleaned/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/nnet3_cleaned/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + echo "# $0 $*" include_looped=false @@ -14,6 +22,11 @@ if [ "$1" == "--looped" ]; then include_looped=true shift fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi @@ -71,6 +84,16 @@ for n in 0 1 2 3; do done echo fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum ${dirname}_online/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi done diff --git a/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh index b4f2dd3e3b4..16093616b05 100755 --- a/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh +++ b/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh @@ -21,9 +21,9 @@ num_threads_ubm=32 nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it # becomes exp/nnet3_cleaned or whatever. -. cmd.sh +. ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh new file mode 120000 index 00000000000..8e03c924bc1 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_lfr_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh index 379c8040a27..f6e4fb71b75 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh @@ -1,5 +1,8 @@ #!/bin/bash + +# 1b is as 1a but uses xconfigs. + # This is the standard "tdnn" system, built in nnet3; this script # is the version that's meant to run with data-cleanup, that doesn't # support parallel alignments. diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..35789342ffb --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# 1c is as 1b but using more 'chain-like' splicing and slightly +# smaller dim. Not better; maybe slightly worse. + +# note: the num-params is almost the same. 
+# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1{b,c}_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) +# exp/nnet3_cleaned/tdnn1c_sp: num-iters=240 nj=2..12 num-params=10.1M dim=40+100->4187 combine=-1.16->-1.15 loglike:train/valid[159,239,combined]=(-1.22,-1.16,-1.15/-1.41,-1.38,-1.38) accuracy:train/valid[159,239,combined]=(0.66,0.67,0.68/0.62,0.63,0.63) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1{b,c}_sp +# System tdnn1b_sp tdnn1c_sp +# WER on dev(orig) 11.7 11.9 +# WER on dev(rescored) 10.9 11.1 +# WER on test(orig) 11.7 11.8 +# WER on test(rescored) 11.0 11.2 +# Final train prob -0.9416 -1.1505 +# Final valid prob -1.1496 -1.3805 +# Final train acc 0.7241 0.6756 +# Final valid acc 0.6788 0.6255 + +# This is the standard "tdnn" system, built in nnet3; this script +# is the version that's meant to run with data-cleanup, that doesn't +# support parallel alignments. + + +# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1b_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1a_sp exp/nnet3_cleaned/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# WER on dev(orig) 11.9 11.7 +# WER on dev(rescored) 11.2 10.9 +# WER on test(orig) 11.6 11.7 +# WER on test(rescored) 11.0 11.0 +# Final train prob -0.9255 -0.9416 +# Final valid prob -1.1842 -1.1496 +# Final train acc 0.7245 0.7241 +# Final valid acc 0.6771 0.6788 + + +# by default, with cleanup: +# local/nnet3/run_tdnn.sh + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1c #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). + rm $dir/.error || true 2>/dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh new file mode 100755 index 00000000000..666c2f1bb31 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh @@ -0,0 +1,200 @@ +#!/bin/bash + + +# run_tdnn_lfr_1a.sh is similar in configuration to run_tdnn_1c.sh, but it's a +# low-frame-rate system (see egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh +# for an example of such a system). 
+ + +# by default, with cleanup: +# local/nnet3/run_tdnn_lfr.sh + +# without cleanup: +# local/nnet3/run_tdnn_lfr.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1a #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. + steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp_comb \ + $lang $ali_dir $treedir +fi + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph +fi + + +if [ $stage -le 17 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). 
+ rm $dir/.error || true 2>/dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $decode_nj \ + --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh index f1502dd2761..28c45836cf7 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh @@ -9,15 +9,16 @@ # System tdnn_lstm1a_sp tdnn_lstm1b_sp # WER on dev(orig) 11.0 11.0 # [looped:] 11.0 11.1 -# WER on dev(rescored) 10.3 10.3 +# WER on dev(rescored) 10.4 10.3 # [looped:] 10.3 10.5 -# WER on test(orig) 10.8 10.6 +# WER on test(orig) 10.7 10.6 # [looped:] 10.7 10.7 # WER on test(rescored) 10.1 9.9 # [looped:] 10.0 10.0 -# Final train prob -0.68810.7954-0.68970.7946 -# Final valid prob -0.77960.7611-0.79890.7582 - +# Final train prob -0.6881 -0.6897 +# Final valid prob -0.7796 -0.7989 +# Final train acc 0.7954 0.7946 +# Final valid acc 0.7611 0.7582 # by default, with cleanup: # local/nnet3/run_tdnn_lstm.sh @@ -53,19 +54,11 @@ label_delay=5 chunk_width=40,30,20 chunk_left_context=40 chunk_right_context=0 -# decode chunk-size options (for non-looped decoding) -extra_left_context=50 -extra_right_context=0 # training options srand=0 remove_egs=true -#decode options -extra_left_context= -extra_right_context= -frames_per_chunk= - . ./cmd.sh . ./path.sh . 
./utils/parse_options.sh @@ -91,8 +84,7 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/${gmm} graph_dir=$gmm_dir/graph ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -dir=exp/nnet3${nnet3_affix}/tdnn_lstm${affix} -dir=${dir}_sp +dir=exp/nnet3${nnet3_affix}/tdnn_lstm${affix}_sp train_data_dir=data/${train_set}_sp_hires_comb train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb @@ -175,15 +167,14 @@ if [ $stage -le 13 ]; then fi if [ $stage -le 14 ]; then - [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; - [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; - [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true for dset in dev test; do ( steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --frames-per-chunk $frames_per_chunk \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh index 1d3b12f2697..bc9a717419d 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1c.sh @@ -8,6 +8,8 @@ # local/chain/tuning/run_tdnn_lstm_1e.sh, but a non-chain nnet3 system, and # with 1.5 times larger hidden dimensions. +# exp/nnet3_cleaned/tdnn_lstm1c_sp: num-iters=246 nj=3..15 num-params=18.7M dim=40+100->4187 combine=-0.67->-0.66 loglike:train/valid[163,245,combined]=(-0.71,-0.63,-0.60/-0.92,-0.88,-0.85) accuracy:train/valid[163,245,combined]=(0.77,0.79,0.80/0.74,0.75,0.75) + # local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1b_sp exp/nnet3_cleaned/tdnn_lstm1c_sp # System tdnn_lstm1a_sp tdnn_lstm1b_sp tdnn_lstm1c_sp # WER on dev(orig) 11.0 11.0 11.0 diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh new file mode 100755 index 00000000000..3e8509bf4ac --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh @@ -0,0 +1,310 @@ +#!/bin/bash + + +# run_tdnn_lstm_lfr_1a.sh is like run_tdnn_lstm_1a.sh, but +# it's a low-frame-rate system. (however, using num-jobs-final=10, +# not 15, which was very high). + + +# Generally the WER is the same or slightly better than before. 
+ +# local/nnet3/compare_wer.sh --looped --online exp/nnet3_cleaned/tdnn_lstm1c_sp exp/nnet3_cleaned/tdnn_lstm_lfr1a_sp 2>/dev/null +# local/nnet3/compare_wer.sh --looped --online exp/nnet3_cleaned/tdnn_lstm1c_sp exp/nnet3_cleaned/tdnn_lstm_lfr1a_sp +# System tdnn_lstm1c_sp tdnn_lstm_lfr1a_sp +# WER on dev(orig) 11.0 10.9 +# [looped:] 10.9 10.9 +# [online:] 10.8 +# WER on dev(rescored) 10.4 10.3 +# [looped:] 10.3 10.3 +# [online:] 10.3 +# WER on test(orig) 10.8 10.7 +# [looped:] 10.7 10.7 +# [online:] 10.7 +# WER on test(rescored) 10.1 10.2 +# [looped:] 10.1 10.1 +# [online:] 10.2 +# Final train prob -0.5998 -0.5437 +# Final valid prob -0.8542 -0.7286 +# Final train acc 0.7988 0.8343 +# Final valid acc 0.7521 0.7888 + + +# by default, with cleanup: +# local/nnet3/run_tdnn_lstm_lfr.sh + +# without cleanup: +# local/nnet3/run_tdnn_lstm_lfr.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix=1a +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +# decode chunk-size options (for non-looped decoding) +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. 
+ steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp_comb \ + $lang $ali_dir $treedir +fi + + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + relu-renorm-layer name=tdnn2 dim=768 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph +fi + +if [ $stage -le 17 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh \ + --acwt 0.333 --post-decode-acwt 3.0 --nj $decode_nj \ + --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --frames-per-chunk $frames_per_chunk \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +if [ $stage -le 18 ]; then + # 'looped' decoding. + # note: you should NOT do this decoding step for setups that have bidirectional + # recurrence, like BLSTMs-- it doesn't make sense and will give bad results. + # we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 0.333 --post-decode-acwt 3.0 \ + --nj $decode_nj --cmd "$decode_cmd" \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 19 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. 
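+      # The online decoding below uses the same low-frame-rate scaling as the
+      # offline decoding above; as a quick sanity check of the numbers,
+      #   acwt * post-decode-acwt = 0.333 * 3.0 ~= 1.0,
+      # so the lattice scores written to disk stay on the usual scale and the
+      # normal range of LM weights applies at scoring time.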
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 0.333 --post-decode-acwt 3.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0; diff --git a/egs/wsj/s5/RESULTS b/egs/wsj/s5/RESULTS index acff4f9d7fe..e6732d21074 100644 --- a/egs/wsj/s5/RESULTS +++ b/egs/wsj/s5/RESULTS @@ -1,8 +1,15 @@ #!/bin/bash -# this RESULTS file was obtained by Haihua Xu in July 2013. - -for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +# this RESULTS file was obtained by Dan Povey in Feb 2017, after +# a rewrite of the run.sh file. +# To see results from the scripts local/nnet3/ and local/chain/, +# look at the top of those files, we don't put those in the +# RESULTS file. + +for dir in exp/*; do + steps/info/gmm_dir_info.pl $dir + for x in $dir/decode*dev93* $dir/decode*eval92*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done +done exit 0 # Use caution when comparing these results with other published results. @@ -13,107 +20,76 @@ exit 0 # in which we only test on utterances that are in either a 5k or 20k subset # of the vocabulary. -# The following results are updated with LDA+MLLT to use 7, not 9 frames of context, -# and also increased the learning rate for the "indirect" fMMI. - # monophone, deltas, trained on the 2k shortest utterances from the si84 data. -%WER 35.39 [ 2914 / 8234, 284 ins, 467 del, 2163 sub ] exp/mono0a/decode_tgpr_dev93/wer_10 -%WER 25.78 [ 1455 / 5643, 142 ins, 184 del, 1129 sub ] exp/mono0a/decode_tgpr_eval92/wer_9 +exp/mono0a: nj=10 align prob=-95.82 over 2.36h [retry=0.4%, fail=0.0%] states=132 gauss=973 +%WER 34.33 [ 2827 / 8234, 266 ins, 457 del, 2104 sub ] exp/mono0a/decode_nosp_tgpr_dev93/wer_10_0.0 +%WER 25.13 [ 1418 / 5643, 138 ins, 192 del, 1088 sub ] exp/mono0a/decode_nosp_tgpr_eval92/wer_10_0.0 + + # first triphone build. Built on half of SI-84. -%WER 20.00 [ 1647 / 8234, 257 ins, 197 del, 1193 sub ] exp/tri1/decode_tgpr_dev93/wer_17 -%WER 13.04 [ 736 / 5643, 137 ins, 61 del, 538 sub ] exp/tri1/decode_tgpr_eval92/wer_14 +exp/tri1: nj=10 align prob=-93.75 over 7.38h [retry=0.4%, fail=0.0%] states=1567 gauss=10025 tree-impr=5.06 +%WER 19.40 [ 1597 / 8234, 247 ins, 199 del, 1151 sub ] exp/tri1/decode_nosp_tgpr_dev93/wer_14_0.5 +%WER 12.76 [ 720 / 5643, 110 ins, 89 del, 521 sub ] exp/tri1/decode_nosp_tgpr_eval92/wer_14_1.0 -# the same, rescored with full trigram model [not pruned.] Note: the tg{1,2,3,4} are +# the above, rescored with full trigram model [not pruned.] Note: the tg{1,2,3,4} are # different rescoring methods. They all give about the same results. Note: 3 and 4 give # the "correct" LM scores. -%WER 18.87 [ 1554 / 8234, 295 ins, 136 del, 1123 sub ] exp/tri1/decode_tgpr_dev93_tg1/wer_14 -%WER 18.87 [ 1554 / 8234, 295 ins, 136 del, 1123 sub ] exp/tri1/decode_tgpr_dev93_tg2/wer_14 -%WER 18.75 [ 1544 / 8234, 266 ins, 152 del, 1126 sub ] exp/tri1/decode_tgpr_dev93_tg3/wer_15 -%WER 18.76 [ 1545 / 8234, 266 ins, 152 del, 1127 sub ] exp/tri1/decode_tgpr_dev93_tg4/wer_15 - -# tri2a is delta+delta-delta features. 
-%WER 17.93 [ 1476 / 8234, 256 ins, 161 del, 1059 sub ] exp/tri2a/decode_tgpr_dev93/wer_16 -%WER 12.42 [ 701 / 5643, 132 ins, 64 del, 505 sub ] exp/tri2a/decode_tgpr_eval92/wer_15 -# just demonstrates how to do decoding constrained by lattices. -%WER 16.76 [ 1380 / 8234, 275 ins, 132 del, 973 sub ] exp/tri2a/decode_tgpr_dev93_fromlats/wer_16 - -# This is an LDA+MLLT system. -%WER 16.43 [ 1353 / 8234, 241 ins, 162 del, 950 sub ] exp/tri2b/decode_tgpr_dev93/wer_16 -%WER 10.69 [ 603 / 5643, 154 ins, 47 del, 402 sub ] exp/tri2b/decode_tgpr_eval92/wer_14 - -# rescoring the lattices with trigram. -%WER 15.29 [ 1252 / 8191, 219 ins, 153 del, 880 sub ] [PARTIAL] exp/tri2b/decode_tgpr_dev93_tg/wer_18 -# using the "biglm" decoding method to avoid the lattice rescoring step [not faster though.] -%WER 15.31 [ 1261 / 8234, 227 ins, 158 del, 876 sub ] exp/tri2b/decode_tgpr_dev93_tg_biglm/wer_18 -# using a Minimum Bayes Risk decoding method on top of the _tg lattices. -%WER 15.15 [ 1241 / 8191, 221 ins, 155 del, 865 sub ] [PARTIAL] exp/tri2b/decode_tgpr_dev93_tg_mbr/wer_18 - -# fMMI, default learning rate (0.001) - -%WER 15.19 [ 1251 / 8234, 213 ins, 148 del, 890 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it3/wer_15 -%WER 15.14 [ 1247 / 8234, 228 ins, 138 del, 881 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it4/wer_14 -%WER 15.06 [ 1240 / 8234, 211 ins, 152 del, 877 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it5/wer_15 -%WER 15.01 [ 1236 / 8234, 206 ins, 154 del, 876 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it6/wer_15 -%WER 14.99 [ 1234 / 8234, 210 ins, 159 del, 865 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it7/wer_15 -%WER 15.23 [ 1254 / 8234, 200 ins, 184 del, 870 sub ] exp/tri2b_fmmi_b0.1/decode_tgpr_dev93_it8/wer_16 - -%WER 15.55 [ 1280 / 8234, 234 ins, 151 del, 895 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it3/wer_15 -%WER 15.63 [ 1287 / 8234, 242 ins, 150 del, 895 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it4/wer_15 -%WER 15.30 [ 1260 / 8234, 224 ins, 143 del, 893 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it5/wer_15 -%WER 15.34 [ 1263 / 8234, 216 ins, 156 del, 891 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it6/wer_16 -%WER 15.34 [ 1263 / 8234, 242 ins, 139 del, 882 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it7/wer_14 -%WER 15.30 [ 1260 / 8234, 245 ins, 134 del, 881 sub ] exp/tri2b_fmmi_b0.1_lr0.005/decode_tgpr_dev93_it8/wer_13 - -%WER 15.21 [ 1252 / 8234, 218 ins, 148 del, 886 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it3/wer_15 -%WER 15.16 [ 1248 / 8234, 205 ins, 159 del, 884 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it4/wer_16 -%WER 15.22 [ 1253 / 8234, 229 ins, 147 del, 877 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it5/wer_15 -%WER 14.90 [ 1227 / 8234, 203 ins, 150 del, 874 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it6/wer_15 -%WER 14.95 [ 1231 / 8234, 202 ins, 152 del, 877 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it7/wer_15 -%WER 15.18 [ 1250 / 8234, 184 ins, 172 del, 894 sub ] exp/tri2b_fmmi_indirect_b0.1/decode_tgpr_dev93_it8/wer_16 - -%WER 15.70 [ 1293 / 8234, 218 ins, 163 del, 912 sub ] exp/tri2b_mmi/decode_tgpr_dev93_it3/wer_16 -%WER 15.61 [ 1285 / 8234, 217 ins, 163 del, 905 sub ] exp/tri2b_mmi/decode_tgpr_dev93_it4/wer_16 -%WER 10.46 [ 590 / 5643, 125 ins, 51 del, 414 sub ] exp/tri2b_mmi/decode_tgpr_eval92_it3/wer_15 -%WER 10.40 [ 587 / 5643, 124 ins, 52 del, 411 sub ] exp/tri2b_mmi/decode_tgpr_eval92_it4/wer_16 - -%WER 15.56 [ 1281 / 8234, 224 ins, 152 del, 905 sub ] 
exp/tri2b_mmi_b0.1/decode_tgpr_dev93_it3/wer_15 -%WER 15.44 [ 1271 / 8234, 220 ins, 165 del, 886 sub ] exp/tri2b_mmi_b0.1/decode_tgpr_dev93_it4/wer_16 -%WER 10.33 [ 583 / 5643, 125 ins, 51 del, 407 sub ] exp/tri2b_mmi_b0.1/decode_tgpr_eval92_it3/wer_15 -%WER 10.33 [ 583 / 5643, 125 ins, 47 del, 411 sub ] exp/tri2b_mmi_b0.1/decode_tgpr_eval92_it4/wer_15 - -%WER 11.43 [ 941 / 8234, 113 ins, 144 del, 684 sub ] exp/tri3b/decode_bd_tgpr_dev93/wer_19 -%WER 16.09 [ 1325 / 8234, 193 ins, 185 del, 947 sub ] exp/tri3b/decode_bd_tgpr_dev93.si/wer_16 -%WER 6.79 [ 383 / 5643, 51 ins, 49 del, 283 sub ] exp/tri3b/decode_bd_tgpr_eval92/wer_18 -%WER 10.61 [ 599 / 5643, 91 ins, 74 del, 434 sub ] exp/tri3b/decode_bd_tgpr_eval92.si/wer_15 -%WER 5.74 [ 324 / 5643, 46 ins, 41 del, 237 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg/wer_19 -%WER 5.90 [ 333 / 5643, 46 ins, 39 del, 248 sub ] exp/tri3b/decode_bd_tgpr_eval92_tg/wer_18 - -%WER 14.17 [ 1167 / 8234, 222 ins, 123 del, 822 sub ] exp/tri3b/decode_tgpr_dev93/wer_17 -%WER 19.37 [ 1595 / 8234, 315 ins, 153 del, 1127 sub ] exp/tri3b/decode_tgpr_dev93.si/wer_15 - -%WER 12.98 [ 1069 / 8234, 209 ins, 116 del, 744 sub ] exp/tri3b/decode_tgpr_dev93_tg/wer_19 -%WER 9.30 [ 525 / 5643, 120 ins, 37 del, 368 sub ] exp/tri3b/decode_tgpr_eval92/wer_18 -%WER 12.95 [ 731 / 5643, 167 ins, 46 del, 518 sub ] exp/tri3b/decode_tgpr_eval92.si/wer_14 -%WER 8.54 [ 482 / 5643, 113 ins, 29 del, 340 sub ] exp/tri3b/decode_tgpr_eval92_tg/wer_17 - -%WER 12.12 [ 998 / 8234, 209 ins, 88 del, 701 sub ] exp/tri4a/decode_tgpr_dev93/wer_17 -%WER 15.98 [ 1316 / 8234, 275 ins, 119 del, 922 sub ] exp/tri4a/decode_tgpr_dev93.si/wer_15 -%WER 7.83 [ 442 / 5643, 107 ins, 23 del, 312 sub ] exp/tri4a/decode_tgpr_eval92/wer_16 -%WER 10.90 [ 615 / 5643, 148 ins, 30 del, 437 sub ] exp/tri4a/decode_tgpr_eval92.si/wer_13 - -%WER 9.15 [ 753 / 8234, 90 ins, 113 del, 550 sub ] exp/tri4b/decode_bd_pp_tgpr_dev93/wer_16 -%WER 12.64 [ 1041 / 8234, 137 ins, 145 del, 759 sub ] exp/tri4b/decode_bd_pp_tgpr_dev93.si/wer_16 -%WER 5.74 [ 324 / 5643, 47 ins, 35 del, 242 sub ] exp/tri4b/decode_bd_pp_tgpr_eval92/wer_19 -%WER 7.92 [ 447 / 5643, 64 ins, 46 del, 337 sub ] exp/tri4b/decode_bd_pp_tgpr_eval92.si/wer_15 -%WER 9.38 [ 772 / 8234, 90 ins, 118 del, 564 sub ] exp/tri4b/decode_bd_tgpr_dev93/wer_18 -%WER 13.07 [ 1076 / 8234, 148 ins, 143 del, 785 sub ] exp/tri4b/decode_bd_tgpr_dev93.si/wer_17 -%WER 6.03 [ 340 / 5643, 66 ins, 26 del, 248 sub ] exp/tri4b/decode_bd_tgpr_eval92/wer_13 -%WER 8.19 [ 462 / 5643, 74 ins, 42 del, 346 sub ] exp/tri4b/decode_bd_tgpr_eval92.si/wer_15 -%WER 12.16 [ 1001 / 8234, 197 ins, 98 del, 706 sub ] exp/tri4b/decode_tgpr_dev93/wer_17 -%WER 15.47 [ 1274 / 8234, 235 ins, 120 del, 919 sub ] exp/tri4b/decode_tgpr_dev93.si/wer_17 -%WER 8.08 [ 456 / 5643, 125 ins, 16 del, 315 sub ] exp/tri4b/decode_tgpr_eval92/wer_13 -%WER 10.49 [ 592 / 5643, 147 ins, 27 del, 418 sub ] exp/tri4b/decode_tgpr_eval92.si/wer_12 +%WER 18.23 [ 1501 / 8234, 245 ins, 181 del, 1075 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg1/wer_15_0.5 +%WER 18.23 [ 1501 / 8234, 245 ins, 181 del, 1075 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg2/wer_15_0.5 +%WER 18.16 [ 1495 / 8234, 268 ins, 153 del, 1074 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg3/wer_16_0.0 +%WER 18.18 [ 1497 / 8234, 268 ins, 154 del, 1075 sub ] exp/tri1/decode_nosp_tgpr_dev93_tg4/wer_16_0.0 + + +# tri2b is an LDA+MLLT system trained on SI-84 +exp/tri2b: nj=10 align prob=-47.22 over 15.10h [retry=0.7%, fail=0.0%] states=2005 gauss=15036 tree-impr=5.45 lda-sum=26.20 
mllt:impr,logdet=1.34,1.97 +%WER 16.37 [ 1348 / 8234, 241 ins, 157 del, 950 sub ] exp/tri2b/decode_nosp_tgpr_dev93/wer_17_0.0 +%WER 10.53 [ 594 / 5643, 110 ins, 60 del, 424 sub ] exp/tri2b/decode_nosp_tgpr_eval92/wer_17_0.5 + + +# tri3b is an LDA+MLLT+SAT system trained on all of SI-284 +exp/tri3b: nj=10 align prob=-44.30 over 81.23h [retry=0.8%, fail=0.1%] states=3362 gauss=40061 fmllr-impr=3.70 over 59.77h tree-impr=7.86 + +%WER 15.56 [ 1281 / 8234, 220 ins, 140 del, 921 sub ] exp/tri3b/decode_nosp_tgpr_dev93.si/wer_17_0.5 +%WER 12.82 [ 1056 / 8234, 135 ins, 147 del, 774 sub ] exp/tri3b/decode_nosp_bd_tgpr_dev93.si/wer_15_0.0 +%WER 9.24 [ 761 / 8234, 89 ins, 109 del, 563 sub ] exp/tri3b/decode_nosp_bd_tgpr_dev93/wer_16_0.0 +%WER 11.53 [ 949 / 8234, 179 ins, 94 del, 676 sub ] exp/tri3b/decode_nosp_tgpr_dev93/wer_15_0.5 +%WER 10.94 [ 901 / 8234, 181 ins, 82 del, 638 sub ] exp/tri3b/decode_nosp_tg_dev93/wer_14_0.5 +%WER 8.16 [ 672 / 8234, 94 ins, 94 del, 484 sub ] exp/tri3b/decode_nosp_bd_tgpr_dev93_fg/wer_17_0.0 + +%WER 10.95 [ 618 / 5643, 148 ins, 36 del, 434 sub ] exp/tri3b/decode_nosp_tgpr_eval92.si/wer_14_0.0 +%WER 8.19 [ 462 / 5643, 77 ins, 51 del, 334 sub ] exp/tri3b/decode_nosp_bd_tgpr_eval92.si/wer_16_0.0 +%WER 5.55 [ 313 / 5643, 35 ins, 45 del, 233 sub ] exp/tri3b/decode_nosp_bd_tgpr_eval92/wer_17_1.0 +%WER 4.89 [ 276 / 5643, 47 ins, 28 del, 201 sub ] exp/tri3b/decode_nosp_bd_tgpr_eval92_fg/wer_15_0.5 +%WER 7.53 [ 425 / 5643, 112 ins, 20 del, 293 sub ] exp/tri3b/decode_nosp_tg_eval92/wer_17_0.0 +%WER 8.15 [ 460 / 5643, 113 ins, 30 del, 317 sub ] exp/tri3b/decode_nosp_tgpr_eval92/wer_14_1.0 + + +# tri4b is an LDA+MLLT+SAT system after estimating pronunciation probabilities +# and word-and-pronunciation-dependent silence probabilities. + +exp/tri4b: nj=10 align prob=-44.46 over 81.23h [retry=0.6%, fail=0.1%] states=3413 gauss=40059 fmllr-impr=0.17 over 60.20h tree-impr=8.70 + +%WER 15.16 [ 1248 / 8234, 253 ins, 96 del, 899 sub ] exp/tri4b/decode_tgpr_dev93.si/wer_17_0.0 +%WER 12.62 [ 1039 / 8234, 141 ins, 124 del, 774 sub ] exp/tri4b/decode_bd_tgpr_dev93.si/wer_17_0.0 +%WER 9.01 [ 742 / 8234, 106 ins, 97 del, 539 sub ] exp/tri4b/decode_bd_tgpr_dev93/wer_16_0.0 +%WER 8.25 [ 679 / 8234, 94 ins, 100 del, 485 sub ] exp/tri4b/decode_bd_tgpr_dev93_fg/wer_17_0.5 +%WER 10.92 [ 899 / 8234, 186 ins, 92 del, 621 sub ] exp/tri4b/decode_tg_dev93/wer_17_0.5 +%WER 11.44 [ 942 / 8234, 203 ins, 87 del, 652 sub ] exp/tri4b/decode_tgpr_dev93/wer_14_0.5 + +%WER 10.93 [ 617 / 5643, 147 ins, 33 del, 437 sub ] exp/tri4b/decode_tgpr_eval92.si/wer_14_1.0 +%WER 8.74 [ 493 / 5643, 104 ins, 34 del, 355 sub ] exp/tri4b/decode_bd_tgpr_eval92.si/wer_15_0.0 +%WER 5.69 [ 321 / 5643, 50 ins, 34 del, 237 sub ] exp/tri4b/decode_bd_tgpr_eval92/wer_17_0.5 +%WER 4.71 [ 266 / 5643, 40 ins, 27 del, 199 sub ] exp/tri4b/decode_bd_tgpr_eval92_fg/wer_17_1.0 +%WER 7.39 [ 417 / 5643, 107 ins, 24 del, 286 sub ] exp/tri4b/decode_tg_eval92/wer_16_1.0 +%WER 7.90 [ 446 / 5643, 111 ins, 27 del, 308 sub ] exp/tri4b/decode_tgpr_eval92/wer_15_1.0 + + +###################################### +## Results below this point were mostly obtained in 2013 by Hainan Xu, +## They are from parts of the script that are now not run by default in the run.sh. +## you can look in the git history to figure out when these results were added. 
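+## (Any single line below can be re-derived, provided the corresponding decode
+##  directory still exists, with the same pipeline the loop at the top of this
+##  file uses, e.g.
+##    grep WER exp/tri4b_fmmi_a/decode_tgpr_dev93_it4/wer_* | utils/best_wer.sh )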
+ %WER 7.99 [ 658 / 8234, 72 ins, 95 del, 491 sub ] exp/tri4b_fmmi_a/decode_bd_tgpr_dev93_it8/wer_12 %WER 11.15 [ 918 / 8234, 180 ins, 81 del, 657 sub ] exp/tri4b_fmmi_a/decode_tgpr_dev93_it3/wer_15 %WER 11.23 [ 925 / 8234, 201 ins, 77 del, 647 sub ] exp/tri4b_fmmi_a/decode_tgpr_dev93_it4/wer_12 @@ -166,7 +142,7 @@ exit 0 # not updated -# DNN on fMLLR features (Karel's setup, [7.8.2015]). +# DNN on fMLLR features (Karel's setup, [7.8.2015]). # frame cross-entropy training %WER 6.05 [ 498 / 8234, 59 ins, 67 del, 372 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_bd_tgpr_dev93/wer_11_0.0 %WER 3.69 [ 208 / 5643, 19 ins, 19 del, 170 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_bd_tgpr_eval92/wer_11_1.0 @@ -298,7 +274,7 @@ for x in exp/nnet3/nnet_tdnn_a/decode_*; do grep WER $x/wer_* | utils/best_wer.s # bidirectional LSTM # ----------------------- -# local/nnet3/run_lstm.sh --affix bidirectional \ +# local/nnet3/run_lstm.sh --affix bidirectional \ # --lstm-delay " [-1,1] [-2,2] [-3,3] " \ # --label-delay 0 \ # --cell-dim 640 \ diff --git a/egs/wsj/s5/local/chain/compare_wer.sh b/egs/wsj/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..edfefad547f --- /dev/null +++ b/egs/wsj/s5/local/chain/compare_wer.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
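+# (Internally the split is just
+#    dirname=$(echo $1 | cut -d: -f1);  epoch=$(echo $1 | cut -s -d: -f2)
+#  and since "cut -s" prints nothing when its input contains no delimiter,
+#  epoch -- and hence epoch_infix -- stays empty for plain directory names.)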
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev93 (tgpr) " + "#WER dev93 (tg) " + "#WER dev93 (big-dict,tgpr) " + "#WER dev93 (big-dict,fg) " + "#WER eval92 (tgpr) " + "#WER eval92 (tg) " + "#WER eval92 (big-dict,tgpr)" + "#WER eval92 (big-dict,fg) ") + +for n in 0 1 2 3 4 5 6 7; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_dev93 tg_dev93 bd_tgpr_dev93 bd_tgpr_dev93_fg tgpr_eval92 tg_eval92 bd_tgpr_eval92 bd_tgpr_eval92_fg) + + wer=$(cat $dirname/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/wsj/s5/local/chain/run_tdnn.sh b/egs/wsj/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/wsj/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/run_tdnn_lstm.sh b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..d874eb0986a --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,361 @@ +#!/bin/bash + + +# This was modified from run_tdnn_lstm_1a.sh, making similar +# changes as the diff from run_tdnn_lstm_1a.sh->run_tdnn_1c.sh +# in egs/tedlium/s5_r2/local/nnet3/tuning, +# specifically: +# changing chunk_left_context to zero, shrink from 0.99->1 +# (since it's not applicable to ReLUs), and removing +# the deriv-truncate-margin option since it's only applicable +# to recurrent setups; removing label-delay. +# adding pre-final layers (I experimented with this, +# it did seem helpful); using 3M not 1.5M frames per iter to keep the +# time per job reasonable; and fewer final jobs (5 not 10). + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp: num-iters=102 nj=2..5 num-params=7.6M dim=40+100->2889 combine=-0.052->-0.051 xent:train/valid[67,101,final]=(-0.881,-0.824,-0.822/-0.953,-0.922,-0.921) logprob:train/valid[67,101,final]=(-0.048,-0.042,-0.041/-0.064,-0.064,-0.063) + +# The following table compares (nnet3 TDNN, chain TDNN+LSTM, this experiment == chain TDNN). +# This is better than the nnet3 TDNN, but the difference with the chain TDNN+LSTM +# is inconsistent. 
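+# (For reference on the xent branch configured further down: with the default
+#  xent_regularize=0.1, the learning-rate factor applied to the xent output
+#  layer works out to 0.5 / 0.1 = 5.0.)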
+ +# local/chain/compare_wer.sh --online exp/nnet3/tdnn1a_sp exp/chain/tdnn_lstm1a_sp exp/chain/tdnn1a_sp +# System tdnn1a_sp tdnn_lstm1a_sp tdnn1a_sp +#WER dev93 (tgpr) 9.18 7.48 7.87 +# [online:] 7.49 8.02 +#WER dev93 (tg) 8.59 7.41 7.61 +# [online:] 7.40 7.70 +#WER dev93 (big-dict,tgpr) 6.45 5.64 5.71 +# [online:] 5.70 5.60 +#WER dev93 (big-dict,fg) 5.83 5.40 5.10 +# [online:] 5.19 5.21 +#WER eval92 (tgpr) 6.15 5.67 5.23 +# [online:] 5.60 5.44 +#WER eval92 (tg) 5.55 5.46 4.87 +# [online:] 5.53 4.87 +#WER eval92 (big-dict,tgpr) 3.58 3.69 3.24 +# [online:] 3.63 3.31 +#WER eval92 (big-dict,fg) 2.98 3.28 2.71 +# [online:] 3.31 2.92 +# Final train prob -0.0341 -0.0414 +# Final valid prob -0.0506 -0.0634 +# Final train prob (xent) -0.5643 -0.8216 +# Final valid prob (xent) -0.6648 -0.9208 + + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=512 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 2889 combine=-0.047->-0.045 xent:train/valid[79,119,final]=(-0.684,-0.569,-0.564/-0.742,-0.668,-0.665) logprob:train/valid[79,119,final]=(-0.045,-0.035,-0.034/-0.058,-0.051,-0.051) + +# The following compares: +# (nnet3 TDNN+LSTM, chain TDNN, this experiment == chain TDNN+LSTM) +# system. +# This is consistently better than the nnet3 TDNN+LSTM, but the +# difference with the chain TDNN is inconsistent. 
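+# (As in the other 'chain' scripts, decoding further down is done with
+#  --acwt 1.0 --post-decode-acwt 10.0: chain systems are decoded with an
+#  acoustic weight of 1.0 rather than the ~0.1 used for conventional models,
+#  and the post-decode scaling of 10 puts the stored lattice scores back on
+#  the scale where the usual LM-weight range applies at scoring time.)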
+ +# local/chain/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/tdnn_lstm1a_sp +# System tdnn_lstm1a_sp tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 8.54 7.87 7.48 +# [online:] 8.57 8.02 7.49 +#WER dev93 (tg) 8.25 7.61 7.41 +# [online:] 8.34 7.70 7.40 +#WER dev93 (big-dict,tgpr) 6.24 5.71 5.64 +# [online:] 6.40 5.60 5.70 +#WER dev93 (big-dict,fg) 5.70 5.10 5.40 +# [online:] 5.77 5.21 5.19 +#WER eval92 (tgpr) 6.52 5.23 5.67 +# [online:] 6.56 5.44 5.60 +#WER eval92 (tg) 6.13 4.87 5.46 +# [online:] 6.24 4.87 5.53 +#WER eval92 (big-dict,tgpr) 3.88 3.24 3.69 +# [online:] 3.88 3.31 3.63 +#WER eval92 (big-dict,fg) 3.38 2.71 3.28 +# [online:] 3.53 2.92 3.31 +# Final train prob -0.0414 -0.0341 +# Final valid prob -0.0634 -0.0506 +# Final train prob (xent) -0.8216 -0.5643 +# Final valid prob (xent) -0.9208 -0.6648 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +label_delay=5 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l [ ... ]" + echo "e.g.: $0 exp/nnet3/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/nnet3/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev93 (tgpr) " + "#WER dev93 (tg) " + "#WER dev93 (big-dict,tgpr) " + "#WER dev93 (big-dict,fg) " + "#WER eval92 (tgpr) " + "#WER eval92 (tg) " + "#WER eval92 (big-dict,tgpr)" + "#WER eval92 (big-dict,fg) ") + +for n in 0 1 2 3 4 5 6 7; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_dev93 tg_dev93 bd_tgpr_dev93 bd_tgpr_dev93_fg tgpr_eval92 tg_eval92 bd_tgpr_eval92 bd_tgpr_eval92_fg) + + wer=$(cat $dirname/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/scoring_kaldi/best_wer | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/wsj/s5/local/nnet3/run_ivector_common.sh b/egs/wsj/s5/local/nnet3/run_ivector_common.sh index 8d4cff326b3..e30988b7bf6 100755 --- a/egs/wsj/s5/local/nnet3/run_ivector_common.sh +++ b/egs/wsj/s5/local/nnet3/run_ivector_common.sh @@ -1,83 +1,215 @@ #!/bin/bash -# this script is called from scripts like run_ms.sh; it does the common stages -# of the build, such as feature extraction. -# This is actually the same as local/online/run_nnet2_common.sh, except -# for the directory names. +set -e -o pipefail -. 
cmd.sh -mfccdir=mfcc +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. -stage=1 -. cmd.sh +stage=0 +nj=30 +train_set=train_si284 # you might set this to e.g. train. +test_sets="test_dev93 test_eval92" +gmm=tri4b # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. + # in the tedlium recip it's _cleaned). + +. ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi if [ $stage -le 1 ]; then - for datadir in train_si284 test_eval93 test_dev93 test_eval92; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - done - utils/subset_data_dir.sh --first data/train_si284_hires 7138 data/train_si84_hires || exit 1 + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp fi if [ $stage -le 2 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We align the si84 data for this purpose. + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_si84 data/lang exp/tri4b exp/nnet3/tri4b_ali_si84 + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done fi if [ $stage -le 3 ]; then - # Train a small system just for its LDA+MLLT transform. 
We use --num-iters 13 - # because after we get the transform (12th iter is the last), any further - # training is pointless. - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ - --realign-iters "" \ - --splice-opts "--left-context=3 --right-context=3" \ - 5000 10000 data/train_si84_hires data/lang \ - exp/nnet3/tri4b_ali_si84 exp/nnet3/tri5b + echo "$0: selecting segments of hires training data that were also present in the" + echo " ... original training data." + + # note, these data-dirs are temporary; we put them in a sub-directory + # of the place where we'll make the alignments. + temp_data_root=exp/nnet3${nnet3_affix}/tri5 + mkdir -p $temp_data_root + + utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ + data/${train_set}_sp_hires $temp_data_root/${train_set}_hires + + # note: essentially all the original segments should be in the hires data. + n1=$(wc -l /dev/null - for data in test_eval92 test_dev93 test_eval93; do - steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ - data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data} || touch exp/nnet3/.error & - done - wait - [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh \ + data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 8 ]; then + echo "$0: making MFCC features for low-resolution speed-perturbed data (needed for alignments)" + steps/make_mfcc.sh --nj $nj \ + --cmd "$train_cmd" data/${train_set}_sp + steps/compute_cmvn_stats.sh data/${train_set}_sp + echo "$0: fixing input data-dir to remove nonexistent features, in case some " + echo ".. speed-perturbed segments were too short." + utils/fix_data_dir.sh data/${train_set}_sp fi +if [ $stage -le 9 ]; then + if [ -f $ali_dir/ali.1.gz ]; then + echo "$0: alignments in $ali_dir appear to already exist. Please either remove them " + echo " ... or use a later --stage option." + exit 1 + fi + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir +fi + + exit 0; diff --git a/egs/wsj/s5/local/nnet3/run_lstm.sh b/egs/wsj/s5/local/nnet3/run_lstm.sh index 2454fb5be63..d9af546b49b 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm.sh @@ -1,5 +1,7 @@ #!/bin/bash +# This script is deprecated, see run_tdnn_lstm.sh + # this is a basic lstm script # LSTM script runs for more epochs than the TDNN script # and each epoch takes twice the time @@ -125,4 +127,3 @@ if [ $stage -le 9 ]; then fi exit 0; - diff --git a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh index 124b04949a0..311ee14d16a 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh @@ -1,5 +1,8 @@ #!/bin/bash + +# This script is deprecated. + set -o pipefail set -e # this is run_discriminative.sh diff --git a/egs/wsj/s5/local/nnet3/run_tdnn.sh b/egs/wsj/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 337c5656de4..00000000000 --- a/egs/wsj/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -. cmd.sh - - -# At this script level we don't support not running on GPU, as it would be painfully slow. 
-# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. - -stage=0 -train_stage=-10 -dir=exp/nnet3/nnet_tdnn_a -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=650 + relu-renorm-layer name=tdnn2 dim=650 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=650 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=650 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=650 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). + rm $dir/.error || true 2>/dev/null + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l b}.sh +# There seems to be no consistent difference. + +# run_tdnn_1a.sh is the standard "tdnn" system, built in nnet3 with xconfigs. + +# local/nnet3/compare_wer.sh exp/nnet3/tdnn1a_sp exp/nnet3/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +#WER dev93 (tgpr) 9.18 9.12 +#WER dev93 (tg) 8.59 8.51 +#WER dev93 (big-dict,tgpr) 6.45 6.19 +#WER dev93 (big-dict,fg) 5.83 5.78 +#WER eval92 (tgpr) 6.15 6.33 +#WER eval92 (tg) 5.55 5.74 +#WER eval92 (big-dict,tgpr) 3.58 3.62 +#WER eval92 (big-dict,fg) 2.98 3.10 +# Final train prob -0.7200 -0.6035 +# Final valid prob -0.8834 -0.7578 +# Final train acc 0.7762 0.8015 +# Final valid acc 0.7301 0.7607 + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
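+# (All of the variables below can be overridden on the command line once
+#  ./utils/parse_options.sh has run; for example, a hypothetical invocation
+#    local/nnet3/run_tdnn.sh --stage 12 --remove-egs false
+#  would skip the data/iVector preparation and go straight to the network
+#  config creation and training, keeping the training examples on disk.)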
+stage=0 +nj=30 + +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +tdnn_affix=1b #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email= +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,2) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-3,3) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-7,2) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-3,3) + relu-renorm-layer name=tdnn6 dim=750 + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). 
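  # (The backgrounded subshells below decode each test set in parallel;
  #  a failing decode touches $dir/.error so that it can be detected once all
  #  of them have finished, rather than being silently ignored.)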
+ rm $dir/.error || true 2>/dev/null + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l 3413 combine=-0.55->-0.54 loglike:train/valid[67,101,combined]=(-0.63,-0.55,-0.55/-0.71,-0.63,-0.63) accuracy:train/valid[67,101,combined]=(0.80,0.82,0.82/0.76,0.78,0.78) + + + +# local/nnet3/compare_wer.sh --looped --online exp/nnet3/tdnn1a_sp exp/nnet3/tdnn_lstm1a_sp 2>/dev/null +# local/nnet3/compare_wer.sh --looped --online exp/nnet3/tdnn1a_sp exp/nnet3/tdnn_lstm1a_sp +# System tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 9.18 8.54 +# [looped:] 8.54 +# [online:] 8.57 +#WER dev93 (tg) 8.59 8.25 +# [looped:] 8.21 +# [online:] 8.34 +#WER dev93 (big-dict,tgpr) 6.45 6.24 +# [looped:] 6.28 +# [online:] 6.40 +#WER dev93 (big-dict,fg) 5.83 5.70 +# [looped:] 5.70 +# [online:] 5.77 +#WER eval92 (tgpr) 6.15 6.52 +# [looped:] 6.45 +# [online:] 6.56 +#WER eval92 (tg) 5.55 6.13 +# [looped:] 6.08 +# [online:] 6.24 +#WER eval92 (big-dict,tgpr) 3.58 3.88 +# [looped:] 3.93 +# [online:] 3.88 +#WER eval92 (big-dict,fg) 2.98 3.38 +# [looped:] 3.47 +# [online:] 3.53 +# Final train prob -0.7200 -0.5492 +# Final valid prob -0.8834 -0.6343 +# Final train acc 0.7762 0.8154 +# Final valid acc 0.7301 0.7849 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nj=$(wc -l 3205 combine=-0.43->-0.42 loglike:train/valid[89,135,combined]=(-0.51,-0.39,-0.38/-0.59,-0.51,-0.51) accuracy:train/valid[89,135,combined]=(0.85,0.88,0.88/0.82,0.84,0.84) + + +# It seems to be a little worse the 
regular-frame-rate system. + +# local/nnet3/compare_wer.sh --looped exp/nnet3/tdnn_lstm1a_sp exp/nnet3/tdnn_lstm_lfr1a_sp +# System tdnn_lstm1a_sp tdnn_lstm_lfr1a_sp +#WER dev93 (tgpr) 8.54 9.02 +# [looped:] 8.54 8.99 +#WER dev93 (tg) 8.25 8.60 +# [looped:] 8.21 8.54 +#WER dev93 (big-dict,tgpr) 6.24 6.85 +# [looped:] 6.28 6.81 +#WER dev93 (big-dict,fg) 5.70 6.33 +# [looped:] 5.70 6.33 +#WER eval92 (tgpr) 6.52 6.52 +# [looped:] 6.45 6.42 +#WER eval92 (tg) 6.13 6.01 +# [looped:] 6.08 5.92 +#WER eval92 (big-dict,tgpr) 3.88 4.22 +# [looped:] 3.93 4.20 +#WER eval92 (big-dict,fg) 3.38 3.76 +# [looped:] 3.47 3.79 +# Final train prob -0.5492 -0.3100 +# Final valid prob -0.6343 -0.4646 +# Final train acc 0.8154 0.9051 +# Final valid acc 0.7849 0.8615 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. 
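  # (For reference, this factor of 3 is what the script records and matches
  #  elsewhere, e.g. further down it runs
  #    echo 3 >$dir/frame_subsampling_factor
  #  and builds its decoding graphs with
  #    utils/mkgraph.sh --self-loop-scale 0.333 ...
  #  i.e. a self-loop scale of 1/3.)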
+ steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp \ + $lang $ali_dir $treedir +fi + + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=10000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_tgpr \ + $dir $dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_bd_tgpr \ + $dir $dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 17 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l data/$y/utt2spk; cp data/$y/utt2spk data/$y/spk2utt; - steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$y exp/make_mfcc/$y $mfccdir || exit 1; done @@ -33,7 +33,7 @@ steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ # get the fMLLR basis. steps/get_fmllr_basis.sh --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri3b + data/train_si284 data/lang${lang_suffix} exp/tri3b # decoding tri3b with basis fMLLR steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ @@ -50,5 +50,3 @@ steps/decode_basis_fmllr.sh --nj 10 --cmd "$decode_cmd" \ steps/decode_basis_fmllr.sh --nj 8 --cmd "$decode_cmd" \ exp/tri3b/graph${lang_suffix}_tgpr data/test_eval92_utt \ exp/tri3b/decode${lang_suffix}_tgpr_eval92_basis_utt || exit 1; - - diff --git a/egs/wsj/s5/local/run_mmi_tri2b.sh b/egs/wsj/s5/local/run_mmi_tri2b.sh deleted file mode 100755 index d7ddbfbaf62..00000000000 --- a/egs/wsj/s5/local/run_mmi_tri2b.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -lang_suffix= - -echo "$0 $@" # Print the command line for logging -. utils/parse_options.sh || exit 1; - -. ./cmd.sh - -# Train and test MMI (and boosted MMI) on tri2b system. -steps/make_denlats.sh --sub-split 20 --nj 10 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} \ - exp/tri2b exp/tri2b_denlats_si84 || exit 1; - -# train the basic MMI system. -steps/train_mmi.sh --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/tri2b_denlats_si84 exp/tri2b_mmi || exit 1; -for iter in 3 4; do - steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_mmi/decode${lang_suffix}_tgpr_dev93_it$iter & - steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_eval92 \ - exp/tri2b_mmi/decode${lang_suffix}_tgpr_eval92_it$iter & -done - -# MMI with 0.1 boosting factor. 
-steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/tri2b_denlats_si84 exp/tri2b_mmi_b0.1 || exit 1; - -for iter in 3 4; do - steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_mmi_b0.1/decode${lang_suffix}_tgpr_dev93_it$iter & - steps/decode_si.sh --nj 8 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_eval92 \ - exp/tri2b_mmi_b0.1/decode${lang_suffix}_tgpr_eval92_it$iter & -done - - -# Train a UBM with 400 components, for fMMI. -steps/train_diag_ubm.sh --silence-weight 0.5 --nj 10 --cmd "$train_cmd" \ - 400 data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 exp/dubm2b - -steps/train_mmi_fmmi.sh --boost 0.1 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/dubm2b exp/tri2b_denlats_si84 exp/tri2b_fmmi_b0.1 - -for iter in `seq 3 8`; do - steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_fmmi_b0.1/decode${lang_suffix}_tgpr_dev93_it$iter & -done - -steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/dubm2b exp/tri2b_denlats_si84 exp/tri2b_fmmi_b0.1_lr0.005 || exit 1; -for iter in `seq 3 8`; do - steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_fmmi_b0.1_lr0.005/decode${lang_suffix}_tgpr_dev93_it$iter & -done - -steps/train_mmi_fmmi_indirect.sh --boost 0.1 --cmd "$train_cmd" \ - data/train_si84 data/lang${lang_suffix} exp/tri2b_ali_si84 \ - exp/dubm2b exp/tri2b_denlats_si84 exp/tri2b_fmmi_indirect_b0.1 -for iter in `seq 3 8`; do - steps/decode_fmmi.sh --nj 10 --cmd "$decode_cmd" --iter $iter \ - exp/tri2b/graph${lang_suffix}_tgpr data/test_dev93 \ - exp/tri2b_fmmi_indirect_b0.1/decode${lang_suffix}_tgpr_dev93_it$iter & -done diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh index fb004117658..4d505f5da3a 100755 --- a/egs/wsj/s5/run.sh +++ b/egs/wsj/s5/run.sh @@ -1,7 +1,15 @@ #!/bin/bash +stage=0 +train=true # set to false to disable the training-related scripts + # note: you probably only want to set --train false if you + # are using at least --stage 1. +decode=true # set to false to disable the decoding-related scripts. + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. +. utils/parse_options.sh # e.g. this parses the --stage option if supplied. + # This is a shell script, but it's recommended that you run the commands one by # one by copying and pasting into the shell. @@ -18,334 +26,313 @@ wsj0=/export/corpora5/LDC/LDC93S6B wsj1=/export/corpora5/LDC/LDC94S13B -local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1; -# Sometimes, we have seen WSJ distributions that do not have subdirectories -# like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the -# wsj0 or wsj1 directories. In such cases, try the following: -# -# corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj -# local/cstr_wsj_data_prep.sh $corpus -# rm data/local/dict/lexiconp.txt -# $corpus must contain a 'wsj0' and a 'wsj1' subdirectory for this to work. -# -# "nosp" refers to the dictionary before silence probabilities and pronunciation -# probabilities are added. 
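# For example, with the stage/train/decode options introduced at the top of
# this script (parsed by utils/parse_options.sh), one could repeat only the
# decoding of the later stages with something like
#   ./run.sh --stage 2 --train false
# or rebuild the models without any decoding via
#   ./run.sh --stage 2 --decode false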
-local/wsj_prepare_dict.sh --dict-suffix "_nosp" || exit 1; - -utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; - -local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1; - - # We suggest to run the next three commands in the background, - # as they are not a precondition for the system building and - # most of the tests: these commands build a dictionary - # containing many of the OOVs in the WSJ LM training data, - # and an LM trained directly on that data (i.e. not just - # copying the arpa files from the disks from LDC). - # Caution: the commands below will only work if $decode_cmd - # is setup to use qsub. Else, just remove the --cmd option. - # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style, - # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead. +if [ $stage -le 0 ]; then + # data preparation. + local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1; + + # Sometimes, we have seen WSJ distributions that do not have subdirectories + # like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the + # wsj0 or wsj1 directories. In such cases, try the following: + # + # corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj + # local/cstr_wsj_data_prep.sh $corpus + # rm data/local/dict/lexiconp.txt + # $corpus must contain a 'wsj0' and a 'wsj1' subdirectory for this to work. + # + # "nosp" refers to the dictionary before silence probabilities and pronunciation + # probabilities are added. + local/wsj_prepare_dict.sh --dict-suffix "_nosp" || exit 1; + + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; + + local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1; + + # We suggest to run the next three commands in the background, + # as they are not a precondition for the system building and + # most of the tests: these commands build a dictionary + # containing many of the OOVs in the WSJ LM training data, + # and an LM trained directly on that data (i.e. not just + # copying the arpa files from the disks from LDC). + # Caution: the commands below will only work if $decode_cmd + # is setup to use qsub. Else, just remove the --cmd option. + # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style, + # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead. ( - local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1 && \ - utils/prepare_lang.sh data/local/dict_nosp_larger \ - "" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \ - local/wsj_train_lms.sh --dict-suffix "_nosp" && - local/wsj_format_local_lms.sh --lang-suffix "_nosp" # && + local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1 && \ + utils/prepare_lang.sh data/local/dict_nosp_larger \ + "" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \ + local/wsj_train_lms.sh --dict-suffix "_nosp" && + local/wsj_format_local_lms.sh --lang-suffix "_nosp" # && ) & -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. - -for x in test_eval92 test_eval93 test_dev93 train_si284; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$x || exit 1; - steps/compute_cmvn_stats.sh data/$x || exit 1; -done - -utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1 - -# Now make subset with the shortest 2k utterances from si-84. 
-utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1; - -# Now make subset with half of the data from si-84. -utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1; - - -# Note: the --boost-silence option should probably be omitted by default -# for normal setups. It doesn't always help. [it's to discourage non-silence -# models from modeling silence.] -steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ - data/train_si84_2kshort data/lang_nosp exp/mono0a || exit 1; - -( - utils/mkgraph.sh data/lang_nosp_test_tgpr \ - exp/mono0a exp/mono0a/graph_nosp_tgpr && \ - steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ - data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \ - steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ - data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 -) & - -steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ - data/train_si84_half data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; - -steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \ - data/train_si84_half data/lang_nosp exp/mono0a_ali exp/tri1 || exit 1; - -while [ ! -f data/lang_nosp_test_tgpr/tmp/LG.fst ] || \ - [ -z data/lang_nosp_test_tgpr/tmp/LG.fst ]; do - sleep 20; -done -sleep 30; -# or the mono mkgraph.sh might be writing -# data/lang_test_tgpr/tmp/LG.fst which will cause this to fail. - -utils/mkgraph.sh data/lang_nosp_test_tgpr \ - exp/tri1 exp/tri1/graph_nosp_tgpr || exit 1; - -steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgpr \ - data/test_dev93 exp/tri1/decode_nosp_tgpr_dev93 || exit 1; -steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgpr \ - data/test_eval92 exp/tri1/decode_nosp_tgpr_eval92 || exit 1; - -# test various modes of LM rescoring (4 is the default one). -# This is just confirming they're equivalent. -for mode in 1 2 3 4; do - steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" \ - data/lang_nosp_test_{tgpr,tg} data/test_dev93 \ - exp/tri1/decode_nosp_tgpr_dev93 \ - exp/tri1/decode_nosp_tgpr_dev93_tg$mode || exit 1; -done - - -## the following command demonstrates how to get lattices that are -## "word-aligned" (arcs coincide with words, with boundaries in the right -## place). -#sil_label=`grep '!SIL' data/lang_nosp_test_tgpr/words.txt | awk '{print $2}'` -#steps/word_align_lattices.sh --cmd "$train_cmd" --silence-label $sil_label \ -# data/lang_nosp_test_tgpr exp/tri1/decode_nosp_tgpr_dev93 \ -# exp/tri1/decode_nosp_tgpr_dev93_aligned || exit 1; - -steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/train_si84 data/lang_nosp exp/tri1 exp/tri1_ali_si84 || exit 1; - -steps/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ - data/train_si84 data/lang_nosp exp/tri1_ali_si84 exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_nosp_test_tgpr \ - exp/tri2b exp/tri2b/graph_nosp_tgpr || exit 1; -steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgpr \ - data/test_dev93 exp/tri2b/decode_nosp_tgpr_dev93 || exit 1; -steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgpr \ - data/test_eval92 exp/tri2b/decode_nosp_tgpr_eval92 || exit 1; - -# At this point, you could run the example scripts that show how VTLN works. -# We haven't included this in the default recipes yet. 
-# local/run_vtln.sh --lang-suffix "_nosp" -# local/run_vtln2.sh --lang-suffix "_nosp" - -# Now, with dev93, compare lattice rescoring with biglm decoding, -# going from tgpr to tg. Note: results are not the same, even though they should -# be, and I believe this is due to the beams not being wide enough. The pruning -# seems to be a bit too narrow in the current scripts (got at least 0.7% absolute -# improvement from loosening beams from their current values). - -steps/decode_biglm.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri2b/graph_nosp_tgpr data/lang_test_{tgpr,tg}/G.fst \ - data/test_dev93 exp/tri2b/decode_nosp_tgpr_dev93_tg_biglm - -# baseline via LM rescoring of lattices. -steps/lmrescore.sh --cmd "$decode_cmd" \ - data/lang_nosp_test_tgpr/ data/lang_nosp_test_tg/ \ - data/test_dev93 exp/tri2b/decode_nosp_tgpr_dev93 \ - exp/tri2b/decode_nosp_tgpr_dev93_tg || exit 1; - -# Trying Minimum Bayes Risk decoding (like Confusion Network decoding): -mkdir exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr -cp exp/tri2b/decode_nosp_tgpr_dev93_tg/lat.*.gz \ - exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr -local/score_mbr.sh --cmd "$decode_cmd" \ - data/test_dev93/ data/lang_nosp_test_tgpr/ \ - exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr - -# This script trains a delta+delta-delta system. It's not really recommended or + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + + for x in test_eval92 test_eval93 test_dev93 train_si284; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$x || exit 1; + steps/compute_cmvn_stats.sh data/$x || exit 1; + done + + utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1 + + # Now make subset with the shortest 2k utterances from si-84. + utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1; + + # Now make subset with half of the data from si-84. + utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1; +fi + + +if [ $stage -le 1 ]; then + # monophone + + + # Note: the --boost-silence option should probably be omitted by default + # for normal setups. It doesn't always help. [it's to discourage non-silence + # models from modeling silence.] 
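  # (Roughly speaking, --boost-silence scales up the models of the
  #  optional-silence phones during alignment, along the lines of
  #    gmm-boost-silence --boost=1.25 `cat data/lang_nosp/phones/optional_silence.csl` \
  #      final.mdl - |
  #  so a value of 1.0 would leave the model unchanged; 1.25 makes silence
  #  slightly more likely to absorb ambiguous frames.)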
+ if $train; then + steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_2kshort data/lang_nosp exp/mono0a || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgpr exp/mono0a exp/mono0a/graph_nosp_tgpr && \ + steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ + data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \ + steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ + data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 + fi +fi + +if [ $stage -le 2 ]; then + # tri1 + if $train; then + steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_half data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; + + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \ + data/train_si84_half data/lang_nosp exp/mono0a_ali exp/tri1 || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgpr \ + exp/tri1 exp/tri1/graph_nosp_tgpr || exit 1; + + for data in dev93 eval92; do + nspk=$(wc -l " data/local/lang_tmp data/lang || exit 1; - -for lm_suffix in bg bg_5k tg tg_5k tgpr tgpr_5k; do - mkdir -p data/lang_test_${lm_suffix} - cp -r data/lang/* data/lang_test_${lm_suffix}/ || exit 1; - rm -rf data/lang_test_${lm_suffix}/tmp - cp data/lang_nosp_test_${lm_suffix}/G.* data/lang_test_${lm_suffix}/ -done - -# Silprob for larger lexicon. -utils/dict_dir_add_pronprobs.sh --max-normalize true \ - data/local/dict_nosp_larger \ - exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \ - exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict_larger || exit 1 - -utils/prepare_lang.sh data/local/dict_larger \ - "" data/local/lang_tmp_larger data/lang_bd || exit 1; - -for lm_suffix in tgpr tgconst tg fgpr fgconst fg; do - mkdir -p data/lang_test_bd_${lm_suffix} - cp -r data/lang_bd/* data/lang_test_bd_${lm_suffix}/ || exit 1; - rm -rf data/lang_test_bd_${lm_suffix}/tmp - cp data/lang_nosp_test_bd_${lm_suffix}/G.* data/lang_test_bd_${lm_suffix}/ -done - -( - utils/mkgraph.sh data/lang_test_tgpr exp/tri4b exp/tri4b/graph_tgpr || exit 1; - steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b/decode_tgpr_dev93 || exit 1; - steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b/decode_tgpr_eval92 || exit 1; - - utils/mkgraph.sh data/lang_test_bd_tgpr \ - exp/tri4b exp/tri4b/graph_bd_tgpr || exit 1; - steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri4b/graph_bd_tgpr data/test_dev93 \ - exp/tri4b/decode_bd_tgpr_dev93 || exit 1; - steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri4b/graph_bd_tgpr data/test_eval92 \ - exp/tri4b/decode_bd_tgpr_eval92 || exit 1; -) & +if [ $stage -le 4 ]; then + # From 2b system, train 3b which is LDA + MLLT + SAT. + + # Align tri2b system with all the si284 data. + if $train; then + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train_si284 data/lang_nosp exp/tri2b exp/tri2b_ali_si284 || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_si284 data/lang_nosp exp/tri2b_ali_si284 exp/tri3b || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_nosp_test_tgpr \ + exp/tri3b exp/tri3b/graph_nosp_tgpr || exit 1; + + # the larger dictionary ("big-dict"/bd) + locally produced LM. 
+ utils/mkgraph.sh data/lang_nosp_test_bd_tgpr \ + exp/tri3b exp/tri3b/graph_nosp_bd_tgpr || exit 1; + + # At this point you could run the command below; this gets + # results that demonstrate the basis-fMLLR adaptation (adaptation + # on small amounts of adaptation data). + # local/run_basis_fmllr.sh --lang-suffix "_nosp" + + for data in dev93 eval92; do + nspk=$(wc -l " data/local/lang_tmp data/lang || exit 1; + + for lm_suffix in bg bg_5k tg tg_5k tgpr tgpr_5k; do + mkdir -p data/lang_test_${lm_suffix} + cp -r data/lang/* data/lang_test_${lm_suffix}/ || exit 1; + rm -rf data/lang_test_${lm_suffix}/tmp + cp data/lang_nosp_test_${lm_suffix}/G.* data/lang_test_${lm_suffix}/ + done + + # Silprob for larger ("bd") lexicon. + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp_larger \ + exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \ + exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict_larger || exit 1 + + utils/prepare_lang.sh data/local/dict_larger \ + "" data/local/lang_tmp_larger data/lang_bd || exit 1; + + for lm_suffix in tgpr tgconst tg fgpr fgconst fg; do + mkdir -p data/lang_test_bd_${lm_suffix} + cp -r data/lang_bd/* data/lang_test_bd_${lm_suffix}/ || exit 1; + rm -rf data/lang_test_bd_${lm_suffix}/tmp + cp data/lang_nosp_test_bd_${lm_suffix}/G.* data/lang_test_bd_${lm_suffix}/ + done +fi + + +if [ $stage -le 6 ]; then + # From 3b system, now using data/lang as the lang directory (we have now added + # pronunciation and silence probabilities), train another SAT system (tri4b). + + if $train; then + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_si284 data/lang exp/tri3b exp/tri4b || exit 1; + fi + + if $decode; then + utils/mkgraph.sh data/lang_test_tgpr \ + exp/tri4b exp/tri4b/graph_tgpr || exit 1; + utils/mkgraph.sh data/lang_test_bd_tgpr \ + exp/tri4b exp/tri4b/graph_bd_tgpr || exit 1; + + for data in dev93 eval92; do + nspk=$(wc -l " - echo " e.g.: steps/mixup.sh 20000 data/train_si84 data/lang exp/tri3b exp/tri3b_20k" - echo "main options (for others, see top of script file)" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --config # config containing options" - echo " --stage # stage to do partial re-run from." - exit 1; -fi - -numgauss=$1 -data=$2 -lang=$3 -srcdir=$4 -dir=$5 - -for f in $data/feats.scp $srcdir/final.mdl $srcdir/final.mat; do - [ ! -f $f ] && echo "mixup_lda_etc.sh: no such file $f" && exit 1; -done - -nj=`cat $srcdir/num_jobs` || exit 1; -sdata=$data/split$nj; - -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` -cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` - -mkdir -p $dir/log -cp $srcdir/splice_opts $dir 2>/dev/null -cp $srcdir/cmvn_opts $dir 2>/dev/null -cp $srcdir/final.mat $dir -echo $nj > $dir/num_jobs -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; - -utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; -cp $lang/phones.txt $dir || exit 1; - -cp $srcdir/tree $dir - - -## Set up features. 
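# (Two pipelines are set up below: 'delta' means CMVN followed by delta and
#  delta-delta features, while 'lda' means CMVN, frame splicing and the
#  final.mat transform.  If the source directory also contains fMLLR
#  transforms (trans.*) they are applied on top of the speaker-independent
#  features to give the adapted features; otherwise the SI features are used
#  as-is.)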
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" - -case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - cp $srcdir/final.mat $dir - ;; - *) echo "Invalid feature type $feat_type" && exit 1; -esac -if [ -f $srcdir/trans.1 ]; then - echo Using transforms from $srcdir; - rm $dir/trans.* 2>/dev/null - ln.pl $srcdir/trans.* $dir # Link those transforms to current directory. - feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" -else - feats="$sifeats" -fi -## Done setting up features. - -rm $dir/fsts.*.gz 2>/dev/null -ln.pl $srcdir/fsts.*.gz $dir # Link training-graph FSTs to current directory. - -## Mix up old model -if [ $stage -le 0 ]; then - echo Mixing up old model to $numgauss Gaussians -# Note: this script also works for mixing down. - $cmd $dir/log/mixup.log \ - gmm-mixup --mix-up=$numgauss --mix-down=$numgauss \ - $srcdir/final.mdl $srcdir/final.occs $dir/1.mdl || exit 1; -fi -## Done. - -cur_alidir=$srcdir # dir to find alignments. -[ -z "$realign_iters" ] && ln.pl $srcdir/ali.*.gz $dir; # link alignments, if - # we won't be generating them. - -x=1 -while [ $x -le $num_iters ]; do - echo "$0: iteration $x" - if echo $realign_iters | grep -w $x >/dev/null; then - if [ $stage -le $x ]; then - echo "$0: realigning data" - mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" - $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ - gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 "$mdl" \ - "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ - "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; - fi - cur_alidir=$dir - fi - if [ $stage -le $x ]; then - echo "$0: accumulating statistics" - $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ - gmm-acc-stats-ali $dir/$x.mdl "$feats" \ - "ark,s,cs:gunzip -c $cur_alidir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1; - echo "$0: re-estimating model" - [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1; - $cmd $dir/log/update.$x.log \ - gmm-est --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; - rm $dir/$x.mdl $dir/$x.*.acc - rm $dir/$x.occs 2>/dev/null - fi - x=$[$x+1] -done - -rm $dir/final.mdl $dir/final.occs 2>/dev/null -ln -s $x.mdl $dir/final.mdl -ln -s $x.occs $dir/final.occs - -if [ -f $dir/trans.1 ]; then - echo "$0: accumulating stats for alignment model." - $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \ - ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ - gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \ - ark,s,cs:- $dir/$x.JOB.acc || exit 1; - [ "`ls $dir/$x.*.acc | wc -w`" -ne $nj ] && echo "$0: wrong #accs" && exit 1; - echo "$0: Re-estimating alignment model." 
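# (The alignment model estimated here is accumulated on the speaker-independent
#  features via gmm-acc-stats-twofeats above, so that it can be used for a
#  first decoding pass before any per-speaker fMLLR transforms exist.)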
- $cmd $dir/log/est_alimdl.log \ - gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1; - rm $dir/$x.*.acc - rm $dir/final.alimdl 2>/dev/null - ln -s $x.alimdl $dir/final.alimdl -fi - -utils/summarize_warnings.pl $dir/log - -echo Done diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 0333d628544..bb8efd56ab8 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -22,12 +22,13 @@ mkdir -p $data/.backup [ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; +set -e -o pipefail -u + tmpdir=$(mktemp -d /tmp/kaldi.XXXX); trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM export LC_ALL=C - function check_sorted { file=$1 sort -k1,1 -u <$file >$file.tmp @@ -54,8 +55,8 @@ function filter_file { cp $file_to_filter ${file_to_filter}.tmp utils/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=`cat ${file_to_filter}.tmp | wc -l` - length2=`cat ${file_to_filter} | wc -l` + length1=$(cat ${file_to_filter}.tmp | wc -l) + length2=$(cat ${file_to_filter} | wc -l) if [ $length1 -ne $length2 ]; then echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." fi @@ -77,7 +78,7 @@ function filter_recordings { exit 1; fi awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=`cat $tmpdir/recordings | wc -l` + n1=$(cat $tmpdir/recordings | wc -l) [ ! -s $tmpdir/recordings ] && \ echo "Empty list of recordings (bad file $data/segments)?" && exit 1; utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh index 42204b85e7d..65ff3c3c79d 100755 --- a/egs/wsj/s5/utils/mkgraph.sh +++ b/egs/wsj/s5/utils/mkgraph.sh @@ -75,7 +75,7 @@ fi N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; } P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; } -[[ -f $2/frame_subsampling_factor && $loopscale != 1.0 ]] && \ +[[ -f $2/frame_subsampling_factor && "$loopscale" == "0.1" ]] && \ echo "$0: WARNING: chain models need '--self-loop-scale 1.0'"; mkdir -p $lang/tmp diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 49c929207b9..58e51a75aef 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -132,7 +132,7 @@ if [ -f $data/wav.scp ]; then check_sorted_and_uniq $data/segments # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. ! cat $data/segments | \ - awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \ + awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ echo "$0: badly formatted segments file" && exit 1; segments_len=`cat $data/segments | wc -l` From 8a5ee4a221de0989c665d17333414fa03e087eda Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Mon, 27 Feb 2017 13:33:35 -0500 Subject: [PATCH 175/213] [scripts,egs] Replace SGE-specific options to queue.pl (etc.) with generic options (#1461) Search for new-style http://kaldi-asr.org/doc/queue.html for an explanation of the difference. It makes switching to new queueing mechanisms easier. 
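Roughly, the substitutions made throughout this patch follow the pattern below
(the exact memory requests vary from script to script):

    queue.pl -pe smp 4                    ->  queue.pl --num-threads 4
    queue.pl -l gpu=1                     ->  queue.pl --gpu 1
    queue.pl -l mem_free=2G,ram_free=2G   ->  queue.pl --mem 2G
    queue.pl -tc 10                       ->  queue.pl --max-jobs-run 10

The generic forms are interpreted by the parallelization wrappers themselves,
which is what makes it possible to switch queueing back-ends without touching
the recipes again.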
--- .../s5/local/online/run_nnet2_ms_perturbed.sh | 14 ++--- .../s5/local/online/run_nnet2_ms_sp_disc.sh | 18 +++--- egs/aspire/s5/local/multi_condition/decode.sh | 2 +- .../s5/local/multi_condition/run_nnet2_ms.sh | 4 +- .../multi_condition/run_nnet2_ms_disc.sh | 18 +++--- egs/callhome_egyptian/s5/run.sh | 18 +++--- egs/chime3/s5/cmd.sh | 6 +- egs/csj/s5/local/csj_run_rnnlm.sh | 12 ++-- egs/csj/s5/local/nnet/run_lstm.sh | 4 +- egs/fisher_callhome_spanish/s5/run.sh | 8 +-- .../s5/local/nnet2/run_6c_gpu.sh | 2 +- .../s5/local/online/run_nnet2.sh | 2 +- .../s5/local/online/run_nnet2_b.sh | 16 +++--- .../local/online/run_nnet2_discriminative.sh | 22 ++++---- .../s5/local/online/run_nnet2_multisplice.sh | 2 +- .../s5/local/online/run_nnet2_ms.sh | 20 +++---- egs/gale_arabic/s5/local/online/run_nnet2.sh | 16 +++--- egs/hkust/s5/local/online/run_nnet2_ms.sh | 4 +- egs/librispeech/s5/local/nnet2/run_5c.sh | 8 +-- .../s5/local/nnet2/run_6a_clean_460.sh | 10 ++-- egs/librispeech/s5/local/nnet2/run_7a_960.sh | 8 +-- egs/librispeech/s5/local/online/run_nnet2.sh | 10 ++-- .../s5/local/online/run_nnet2_disc.sh | 14 ++--- .../s5/local/online/run_nnet2_ms.sh | 10 ++-- .../s5/local/online/run_nnet2_ms_disc.sh | 14 ++--- .../s5/local/online_pitch/run_nnet2_ms.sh | 10 ++-- egs/lre/v1/lid/train_diag_ubm.sh | 6 +- egs/lre/v1/lid/train_ivector_extractor.sh | 12 ++-- egs/lre/v1/run.sh | 14 ++--- egs/lre07/v1/lid/nnet2/get_egs2.sh | 18 +++--- .../v1/lid/nnet2/train_multisplice_accel2.sh | 36 ++++++------ egs/lre07/v1/lid/train_diag_ubm.sh | 4 +- egs/lre07/v1/lid/train_ivector_extractor.sh | 2 +- .../v1/lid/train_ivector_extractor_dnn.sh | 2 +- egs/lre07/v1/run.sh | 12 ++-- .../v2/local/dnn/run_nnet2_multisplice.sh | 10 ++-- egs/lre07/v2/run.sh | 12 ++-- egs/rm/s5/local/nnet2/run_4b_gpu.sh | 2 +- egs/rm/s5/local/nnet2/run_4c.sh | 8 +-- egs/rm/s5/local/nnet2/run_4d.sh | 8 +-- egs/rm/s5/local/nnet2/run_4d2.sh | 8 +-- egs/rm/s5/local/nnet2/run_4d3.sh | 8 +-- egs/rm/s5/local/nnet2/run_4e_gpu.sh | 2 +- egs/rm/s5/local/nnet2/run_5c.sh | 6 +- egs/rm/s5/local/nnet2/run_5c_gpu.sh | 12 ++-- egs/rm/s5/local/nnet2/run_5d.sh | 16 +++--- egs/rm/s5/local/nnet2/run_5d_gpu.sh | 12 ++-- egs/rm/s5/local/nnet2/run_5e_gpu.sh | 12 ++-- egs/rm/s5/local/online/run_nnet2.sh | 10 ++-- egs/rm/s5/local/online/run_nnet2_baseline.sh | 8 +-- egs/rm/s5/local/online/run_nnet2_common.sh | 8 +-- .../s5/local/online/run_nnet2_multisplice.sh | 10 ++-- .../online/run_nnet2_multisplice_disc.sh | 12 ++-- egs/rm/s5/local/online/run_nnet2_perturbed.sh | 12 ++-- egs/rm/s5/local/online/run_nnet2_wsj.sh | 28 +++++----- egs/rm/s5/local/online/run_nnet2_wsj_joint.sh | 16 +++--- .../local/online/run_nnet2_wsj_joint_disc.sh | 12 ++-- egs/rm/s5/local/run_dnn_convert_nnet2.sh | 4 +- egs/sprakbanken/s5/local/nnet2/run_5b_gpu.sh | 2 +- egs/sprakbanken/s5/local/nnet2/run_5c2_gpu.sh | 6 +- egs/sprakbanken/s5/local/nnet2/run_5c_gpu.sh | 8 +-- egs/sprakbanken/s5/local/nnet2/run_5d_gpu.sh | 2 +- egs/sprakbanken/s5/local/nnet2/run_6c_gpu.sh | 8 +-- egs/sre08/v1/local/run_more_data.sh | 24 ++++---- egs/sre08/v1/run.sh | 16 +++--- egs/sre08/v1/sid/nnet2/get_egs2.sh | 18 +++--- .../v1/sid/nnet2/train_multisplice_accel2.sh | 14 ++--- egs/sre08/v1/sid/train_diag_ubm.sh | 4 +- .../v1/sid/train_ivector_extractor_dnn.sh | 6 +- .../v1/local/dnn/run_nnet2_multisplice.sh | 4 +- egs/sre10/v2/cmd.sh | 8 +-- egs/swahili/s5/cmd.sh | 8 +-- egs/swbd/s5b/local/nnet2/run_5a_gpu.sh | 2 +- egs/swbd/s5b/local/nnet2/run_5b_gpu.sh | 2 +- egs/swbd/s5b/local/nnet2/run_5c_gpu.sh | 2 +- 
egs/swbd/s5b/local/nnet2/run_5d_gpu.sh | 2 +- egs/swbd/s5b/local/nnet2/run_5e_gpu.sh | 2 +- egs/swbd/s5b/local/nnet2/run_5f_gpu.sh | 2 +- egs/swbd/s5b/local/nnet2/run_6a_gpu.sh | 2 +- egs/swbd/s5b/local/nnet2/run_6c_gpu.sh | 14 ++--- egs/swbd/s5b/local/online/run_nnet2.sh | 20 +++---- .../s5b/local/online/run_nnet2_baseline.sh | 8 +-- egs/swbd/s5b/local/online/run_nnet2_fisher.sh | 26 ++++----- egs/swbd/s5b/local/online/run_nnet2_ms.sh | 12 ++-- .../s5b/local/online/run_nnet2_ms_disc.sh | 14 ++--- .../local/online/run_nnet2_perturb_speed.sh | 24 ++++---- egs/swbd/s5c/local/online/run_nnet2_ms.sh | 12 ++-- .../local/online/run_nnet2_ms_perturbed.sh | 12 ++-- egs/tedlium/s5/cmd.sh | 10 ++-- .../s5/local/online/run_nnet2_ensemble.sh | 10 ++-- egs/tedlium/s5/local/online/run_nnet2_ms.sh | 10 ++-- .../s5/local/online/run_nnet2_ms_disc.sh | 14 ++--- .../s5/local/online/run_nnet2_ms_perturbed.sh | 14 ++--- egs/wsj/s5/local/nnet2/run_5b_gpu.sh | 2 +- egs/wsj/s5/local/nnet2/run_5c.sh | 12 ++-- egs/wsj/s5/local/nnet2/run_5c2_gpu.sh | 2 +- egs/wsj/s5/local/nnet2/run_5d.sh | 8 +-- egs/wsj/s5/local/nnet2/run_5e_gpu.sh | 2 +- egs/wsj/s5/local/nnet2/run_6c_gpu.sh | 12 ++-- egs/wsj/s5/local/nnet2/run_6d.sh | 6 +- egs/wsj/s5/local/nnet2/run_6d_gpu.sh | 10 ++-- egs/wsj/s5/local/nnet2/run_bnf.sh | 30 +++++----- egs/wsj/s5/local/nnet2/run_pnorm_bnf.sh | 28 +++++----- egs/wsj/s5/local/online/run_nnet2_baseline.sh | 8 +-- .../local/online/run_nnet2_discriminative.sh | 20 +++---- .../local/online/run_nnet2_perturb_speed.sh | 2 +- egs/wsj/s5/local/run_bnf_sgmm.sh | 20 +++---- egs/wsj/s5/steps/nnet2/get_egs2.sh | 2 +- .../s5/steps/nnet2/get_egs_discriminative2.sh | 2 +- egs/wsj/s5/steps/nnet2/retrain_fast.sh | 18 +++--- egs/wsj/s5/steps/nnet2/retrain_simple2.sh | 24 ++++---- egs/wsj/s5/steps/nnet2/train_block.sh | 26 ++++----- .../s5/steps/nnet2/train_convnet_accel2.sh | 48 ++++++++-------- .../steps/nnet2/train_multisplice_accel2.sh | 4 +- .../steps/nnet2/train_multisplice_ensemble.sh | 56 +++++++++---------- egs/wsj/s5/steps/nnet2/train_pnorm.sh | 36 ++++++------ egs/wsj/s5/steps/nnet2/train_pnorm_accel2.sh | 4 +- .../nnet2/train_pnorm_bottleneck_fast.sh | 34 +++++------ .../s5/steps/nnet2/train_pnorm_ensemble.sh | 44 +++++++-------- egs/wsj/s5/steps/nnet2/train_pnorm_fast.sh | 32 +++++------ .../s5/steps/nnet2/train_pnorm_multisplice.sh | 30 +++++----- .../steps/nnet2/train_pnorm_multisplice2.sh | 26 ++++----- egs/wsj/s5/steps/nnet2/train_pnorm_simple.sh | 30 +++++----- egs/wsj/s5/steps/nnet2/train_pnorm_simple2.sh | 50 ++++++++--------- egs/wsj/s5/steps/nnet2/train_tanh.sh | 4 +- .../s5/steps/nnet2/train_tanh_bottleneck.sh | 32 +++++------ egs/wsj/s5/steps/nnet2/train_tanh_fast.sh | 32 +++++------ egs/wsj/s5/steps/nnet2/update_nnet.sh | 14 ++--- egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh | 8 +-- egs/wsj/s5/steps/nnet3/get_degs.sh | 2 +- .../s5/steps/nnet3/get_egs_discriminative.sh | 2 +- egs/wsj/s5/steps/nnet3/lstm/train.sh | 6 +- egs/wsj/s5/steps/nnet3/tdnn/train.sh | 6 +- egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh | 6 +- egs/wsj/s5/steps/nnet3/train_tdnn.sh | 6 +- egs/wsj/s5/steps/online/nnet2/get_egs.sh | 16 +++--- .../online/nnet2/get_egs_discriminative2.sh | 2 +- egs/wsj/s5/utils/convert_slf_parallel.sh | 4 +- 138 files changed, 862 insertions(+), 862 deletions(-) diff --git a/egs/ami/s5/local/online/run_nnet2_ms_perturbed.sh b/egs/ami/s5/local/online/run_nnet2_ms_perturbed.sh index 24176d69a34..a6c2d02b7af 100755 --- a/egs/ami/s5/local/online/run_nnet2_ms_perturbed.sh +++ 
b/egs/ami/s5/local/online/run_nnet2_ms_perturbed.sh @@ -8,7 +8,7 @@ # This example script demonstrates how speed perturbation of the data helps the nnet training in the SWB setup. . ./cmd.sh -set -e +set -e stage=1 train_stage=-10 use_gpu=true @@ -27,13 +27,13 @@ fix_nnet=false if $use_gpu; then if ! cuda-compiled; then - cat < # Iteration of model to decode; default is final." echo " --scoring-opts # options to local/score.sh" echo " --num-threads # number of threads to use, default 1." - echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + echo " --parallel-opts # e.g. '--num-threads 4' if you supply --num-threads 4" exit 1; fi diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh index 3b778b23162..4e34c78255a 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh @@ -28,7 +28,7 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. Otherwise, call this script with --use-gpu false EOF fi - parallel_opts="-l gpu=1" + parallel_opts="--gpu 1" num_threads=1 minibatch_size=512 @@ -47,7 +47,7 @@ else # almost the same, but this may be a little bit slow. num_threads=16 minibatch_size=128 - parallel_opts="-pe smp $num_threads" + parallel_opts="--num-threads $num_threads" fi # do the common parts of the script. diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh index ad5fba0929f..dc285f28f8e 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh @@ -8,7 +8,7 @@ # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# +# # Note: rather than using any features we have dumped on disk, this script # regenerates them from the wav data three times-- when we do lattice # generation, numerator alignment and discriminative training. This made the @@ -42,20 +42,20 @@ set -e if $use_gpu; then if ! cuda-compiled; then - cat <" data/local/lang data/lang # Make sure that you do not use your test and your dev sets to train the LM -# Some form of cross validation is possible where you decode your dev/set based on an +# Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation local/callhome_train_lms.sh $split local/callhome_create_test_lang.sh @@ -100,7 +100,7 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. 
on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ @@ -108,7 +108,7 @@ steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ steps/train_sat.sh --cmd "$train_cmd" \ 2200 25000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; - + ( utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ @@ -140,9 +140,9 @@ steps/train_sat.sh --cmd "$train_cmd" \ )& dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ - --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + --parallel-opts "--num-threads 16" --cmd "queue.pl --mem 1G") dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ - --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + --parallel-opts "--gpu 1" --cmd "queue.pl --mem 1G") steps/nnet2/train_pnorm_ensemble.sh \ --mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008\ @@ -153,17 +153,17 @@ steps/nnet2/train_pnorm_ensemble.sh \ data/train data/lang exp/tri5a_ali exp/tri6a_dnn ( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev ) & # Decode test sets ( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_test exp/tri5a/graph data/test exp/tri6a_dnn/decode_test - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_sup exp/tri5a/graph data/sup exp/tri6a_dnn/decode_sup - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_h5 exp/tri5a/graph data/h5 exp/tri6a_dnn/decode_h5 ) & diff --git a/egs/chime3/s5/cmd.sh b/egs/chime3/s5/cmd.sh index 7ee5fbcd73d..cf2570db1a9 100755 --- a/egs/chime3/s5/cmd.sh +++ b/egs/chime3/s5/cmd.sh @@ -6,9 +6,9 @@ # the number of cpus on your machine. #a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" +#export train_cmd="queue.pl" +#export decode_cmd="queue.pl --mem 4G" +#export mkgraph_cmd="queue.pl --mem 4G" #export cuda_cmd="..." 
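# For comparison, a complete cmd.sh in the new generic style might look
# something like the following (the memory values here are only examples):
#   export train_cmd="queue.pl --mem 2G"
#   export decode_cmd="queue.pl --mem 4G"
#   export mkgraph_cmd="queue.pl --mem 4G"
# or, when running everything on a single machine without a grid engine:
#   export train_cmd=run.pl
#   export decode_cmd=run.pl
#   export mkgraph_cmd=run.pl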
diff --git a/egs/csj/s5/local/csj_run_rnnlm.sh b/egs/csj/s5/local/csj_run_rnnlm.sh index 5c6cd4343f6..e02f19bb680 100755 --- a/egs/csj/s5/local/csj_run_rnnlm.sh +++ b/egs/csj/s5/local/csj_run_rnnlm.sh @@ -3,7 +3,7 @@ # Copyright 2016 Tokyo Institute of Technology (Authors: Tomohiro Tanaka, Takafumi Moriya and Takahiro Shinozaki) # 2016 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) # Apache 2.0 -# Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055. +# Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055. [ -f ./path.sh ] && . ./path.sh . utils/parse_options.sh @@ -21,7 +21,7 @@ echo h30 Begin local/csj_train_rnnlms.sh --dict-suffix "_nosp" data/local/rnnlm.h30 sleep 20; # wait till tools compiled. -echo h100 Begin +echo h100 Begin local/csj_train_rnnlms.sh --dict-suffix "_nosp" \ --hidden 100 --nwords 10000 --class 200 \ --direct 0 data/local/rnnlm.h100 @@ -60,9 +60,9 @@ for dict in rnnlm.h30 rnnlm.h100 rnnlm.h200 rnnlm.h300 rnnlm.h400 rnnlm.h500 ;do echo "rnnlm0.5" steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver \ - --N 100 --cmd "queue -l mem_free=1G" --inv-acwt $acwt 0.5 \ + --N 100 --cmd "$decode_cmd --mem 1G" --inv-acwt $acwt 0.5 \ data/lang_csj_tg $dir data/$eval_num $sourcedir ${resultsdir}_L0.5 - + rm -rf ${resultsdir}_L0.25 rm -rf ${resultsdir}_L0.75 cp -rp ${resultsdir}_L0.5 ${resultsdir}_L0.25 @@ -70,12 +70,12 @@ for dict in rnnlm.h30 rnnlm.h100 rnnlm.h200 rnnlm.h300 rnnlm.h400 rnnlm.h500 ;do echo "rnnlm0.25" steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver \ - --stage 7 --N 100 --cmd "$decode_cmd -l mem_free=1G" --inv-acwt $acwt 0.25 \ + --stage 7 --N 100 --cmd "$decode_cmd --mem 1G" --inv-acwt $acwt 0.25 \ data/lang_csj_tg $dir data/$eval_num $sourcedir ${resultsdir}_L0.25 echo "rnnlm0.75" steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver \ - --stage 7 --N 100 --cmd "$decode_cmd -l mem_free=1G" --inv-acwt $acwt 0.75 \ + --stage 7 --N 100 --cmd "$decode_cmd --mem 1G" --inv-acwt $acwt 0.75 \ data/lang_csj_tg $dir data/$eval_num $sourcedir ${resultsdir}_L0.75 done done diff --git a/egs/csj/s5/local/nnet/run_lstm.sh b/egs/csj/s5/local/nnet/run_lstm.sh index 3cc330c55a8..dc0f40dec24 100755 --- a/egs/csj/s5/local/nnet/run_lstm.sh +++ b/egs/csj/s5/local/nnet/run_lstm.sh @@ -34,10 +34,10 @@ stage=0 steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 10 $dir $dir/log $dir/data || exit 1; steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; done - + # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; # Split the training set diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 380a8aec936..ad650cd390e 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -256,7 +256,7 @@ steps/train_mmi_sgmm2.sh \ ( utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph -steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ +steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 12"\ exp/tri5a/graph data/dev exp/tri5a/decode_dev utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph @@ 
-274,9 +274,9 @@ done dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ - --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 --mem 2G") + --parallel-opts "--num-threads 16" --cmd "queue.pl --mem 2G") dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ - --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 --mem 2G") + --parallel-opts "--gpu 1" --cmd "queue.pl --mem 2G") steps/nnet2/train_pnorm_ensemble.sh \ --mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008\ @@ -287,7 +287,7 @@ steps/nnet2/train_pnorm_ensemble.sh \ data/train data/lang exp/tri5a_ali exp/tri6a_dnn ( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev ) & wait diff --git a/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh b/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh index eae5f7b8581..210d0f5646f 100755 --- a/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh +++ b/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh @@ -21,7 +21,7 @@ EOF . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ "$USER" == dpovey ]; then diff --git a/egs/fisher_english/s5/local/online/run_nnet2.sh b/egs/fisher_english/s5/local/online/run_nnet2.sh index 0b9adb7d315..de4d56bb52e 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2.sh @@ -21,7 +21,7 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. EOF fi -parallel_opts="-l gpu=1" +parallel_opts="--gpu 1" num_threads=1 minibatch_size=512 dir=exp/nnet2_online/nnet_a diff --git a/egs/fisher_english/s5/local/online/run_nnet2_b.sh b/egs/fisher_english/s5/local/online/run_nnet2_b.sh index 7eac7cf0a7d..e1491a10c0b 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2_b.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2_b.sh @@ -19,22 +19,22 @@ set -e if $use_gpu; then if ! cuda-compiled; then - cat < # stage to do partial re-run from." 
echo " --num-gselect # Number of Gaussians per frame to" echo " # limit computation to, for speed" - echo " --subsample # In main E-M phase, use every n" + echo " --subsample # In main E-M phase, use every n" echo " # frames (a speedup)" echo " --num-frames # Maximum num-frames to keep in memory" echo " # for model initialization" @@ -59,7 +59,7 @@ if [ $# != 3 ]; then echo " # in initialization phase (then split)" echo " --num-threads # number of threads to use in initialization" echo " # phase (must match with parallel-opts option)" - echo " --parallel-opts # Option should match number of threads in" + echo " --parallel-opts # Option should match number of threads in" echo " # --num-threads option above" echo " --min-gaussian-weight # min Gaussian weight allowed in GMM" echo " # initialization (this relatively high" diff --git a/egs/lre/v1/lid/train_ivector_extractor.sh b/egs/lre/v1/lid/train_ivector_extractor.sh index 8e238985f99..18f536a60cb 100755 --- a/egs/lre/v1/lid/train_ivector_extractor.sh +++ b/egs/lre/v1/lid/train_ivector_extractor.sh @@ -13,7 +13,7 @@ # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -24,8 +24,8 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 @@ -84,7 +84,7 @@ nj_full=$[$nj*$num_processes] sdata=$data/split$nj_full; utils/split_data.sh $data $nj_full || exit 1; -parallel_opts="-pe smp $[$num_threads*$num_processes]" +parallel_opts="--num-threads $[$num_threads*$num_processes]" ## Set up features. feats="ark,s,cs:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:$sdata/JOB/feats.scp ark:- | add-deltas-sdc ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -97,7 +97,7 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1 -fi +fi # Do Gaussian selection and posterior extracion @@ -146,7 +146,7 @@ while [ $x -lt $num_iters ]; do nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. 
- $cmd -pe smp $nt $dir/log/update.$x.log \ + $cmd --num-threads $nt $dir/log/update.$x.log \ ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; rm $dir/acc.$x.* if $cleanup; then diff --git a/egs/lre/v1/run.sh b/egs/lre/v1/run.sh index 740fad7aceb..bc0f8db572d 100755 --- a/egs/lre/v1/run.sh +++ b/egs/lre/v1/run.sh @@ -50,9 +50,9 @@ rm foo local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train # This commented script is an alternative to the above utterance -# splitting method. Here we split the utterance based on the number of +# splitting method. Here we split the utterance based on the number of # frames which are voiced, rather than the total number of frames. -# max_voiced=3000 +# max_voiced=3000 # local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train use_vtln=true @@ -61,7 +61,7 @@ if $use_vtln; then cp -rt data/${t} data/${t}_novtln rm -r data/${t}_novtln/{split,.backup,spk2warp} 2>/dev/null || true steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 100 --cmd "$train_cmd" \ - data/${t}_novtln exp/make_mfcc $mfccdir + data/${t}_novtln exp/make_mfcc $mfccdir lid/compute_vad_decision.sh data/${t}_novtln exp/make_mfcc $mfccdir done # Vtln-related things: @@ -115,7 +115,7 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \ # Alternatively, a diagonal UBM can replace the full UBM used above. # The preceding calls to train_diag_ubm.sh and train_full_ubm.sh # can be commented out and replaced with the following lines. -# +# # This results in a slight degradation but could improve error rate when # there is less training data than used in this example. # @@ -125,12 +125,12 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \ #gmm-global-to-fgmm exp/diag_ubm_2048/final.dubm \ # exp/full_ubm_2048/final.ubm -lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \ +lid/train_ivector_extractor.sh --cmd "$train_cmd --mem 2G" \ --num-iters 5 exp/full_ubm_2048/final.ubm data/train \ exp/extractor_2048 -lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048 data/train exp/ivectors_train -lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048 data/lre07 exp/ivectors_lre07 diff --git a/egs/lre07/v1/lid/nnet2/get_egs2.sh b/egs/lre07/v1/lid/nnet2/get_egs2.sh index 27cf82bd1a1..7806dce4894 100755 --- a/egs/lre07/v1/lid/nnet2/get_egs2.sh +++ b/egs/lre07/v1/lid/nnet2/get_egs2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). # 2015 David Snyder # Apache 2.0. # @@ -54,7 +54,7 @@ transform_dir= # If supplied, overrides alidir as the place to find fMLLR tr postdir= # If supplied, we will use posteriors in it as soft training targets. stage=0 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. random_copy=false online_ivector_dir= # can be used if we are including speaker information as iVectors. @@ -83,7 +83,7 @@ if [ $# != 3 ]; then echo " # very end." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." 
- + exit 1; fi @@ -109,7 +109,7 @@ utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info cp $alidir/tree $dir -# Get list of validation utterances. +# Get list of validation utterances. awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/valid_uttlist || exit 1; @@ -129,7 +129,7 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir -## Set up features. +## Set up features. if [ -z $feat_type ]; then if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi fi @@ -140,7 +140,7 @@ case $feat_type in valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` # caution: the top-level nnet training script should copy these to its own dir now. cp $alidir/{splice_opts,final.mat} $dir || exit 1; @@ -280,13 +280,13 @@ if [ $stage -le 3 ]; then egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark" done echo "$0: Generating training examples on disk" - # The examples will go round-robin to egs_list. + # The examples will go round-robin to egs_list. if [ ! -z $postdir ]; then $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \ scp:$postdir/post.JOB.scp ark:- \| \ nnet-copy-egs ark:- $egs_list || exit 1; - else + else $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \ "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ @@ -299,7 +299,7 @@ if [ $stage -le 4 ]; then # shuffle the order, writing to the egs.JOB.ark egs_list= - for n in $(seq $nj); do + for n in $(seq $nj); do egs_list="$egs_list $dir/egs_orig.JOB.$n.ark" done diff --git a/egs/lre07/v1/lid/nnet2/train_multisplice_accel2.sh b/egs/lre07/v1/lid/nnet2/train_multisplice_accel2.sh index 4809f42e633..533001934ab 100755 --- a/egs/lre07/v1/lid/nnet2/train_multisplice_accel2.sh +++ b/egs/lre07/v1/lid/nnet2/train_multisplice_accel2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -9,7 +9,7 @@ # This is a modified version of train_multisplice_accel2.sh in # steps/nnet2/ for language recognition. The main difference is -# that it uses different get_lda.sh and get_egs2.sh scripts. +# that it uses different get_lda.sh and get_egs2.sh scripts. # # The original train_multisplice_accel2.sh was a modified version of # train_pnorm_multisplice2.sh (still using pnorm). The "accel" refers to the @@ -25,11 +25,11 @@ num_epochs=15 # Number of epochs of training; initial_effective_lrate=0.01 final_effective_lrate=0.001 bias_stddev=0.5 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 minibatch_size=128 # by default use a smallish minibatch size for neural net # training; this controls instability which would otherwise - # be a problem with multi-threaded update. + # be a problem with multi-threaded update. samples_per_iter=400000 # each iteration of training, see this many samples # per job. 
This option is passed to get_egs.sh @@ -66,7 +66,7 @@ splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3" # so hidden layer indexing is different from component count -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. These don't randprune=4.0 # speeds up LDA. alpha=4.0 # relates to preconditioning. update_period=4 # relates to online preconditioning: says how often we update the subspace. @@ -78,11 +78,11 @@ precondition_rank_out=80 # relates to online preconditioning mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 -combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage. +combine_parallel_opts="--num-threads 8" # queue options for the "combine" stage. cleanup=true egs_dir= lda_opts= @@ -92,7 +92,7 @@ transform_dir= # If supplied, overrides alidir feat_type= # Can be used to force "raw" features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -127,10 +127,10 @@ if [ $# != 4 ]; then echo " --num-threads # Number of parallel threads per job (will affect results" echo " # as well as speed; may interact with batch size; if you increase" echo " # this, you may want to decrease the batch size." - echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" - echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" - echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce --mem" + echo " # versus your defaults, because it gets multiplied by the --num-threads argument." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -148,7 +148,7 @@ if [ $# != 4 ]; then echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -372,7 +372,7 @@ while [ $x -lt $num_iters ]; do ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! 
-z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -417,7 +417,7 @@ while [ $x -lt $num_iters ]; do steps/nnet2/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ @@ -461,7 +461,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. - + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -500,7 +500,7 @@ while [ $x -lt $num_iters ]; do n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; @@ -537,7 +537,7 @@ if [ $stage -le $num_iters ]; then cur_offset=0 # current offset from first_model_combine. for n in $(seq $max_models_combine); do next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" + sub_list="" for o in $(seq $cur_offset $[$next_offset-1]); do iter=$[$first_model_combine+$o] mdl=$dir/$iter.mdl diff --git a/egs/lre07/v1/lid/train_diag_ubm.sh b/egs/lre07/v1/lid/train_diag_ubm.sh index 70325cf529d..a5e256818ce 100755 --- a/egs/lre07/v1/lid/train_diag_ubm.sh +++ b/egs/lre07/v1/lid/train_diag_ubm.sh @@ -29,7 +29,7 @@ cleanup=true min_gaussian_weight=0.0001 remove_low_count_gaussians=true # set this to false if you need #gauss to stay fixed. num_threads=32 -parallel_opts="-pe smp 32" +parallel_opts="--num-threads 32" # End configuration section. echo "$0 $@" # Print the command line for logging @@ -59,7 +59,7 @@ if [ $# != 3 ]; then echo " # in initialization phase (then split)" echo " --num-threads # number of threads to use in initialization" echo " # phase (must match with parallel-opts option)" - echo " --parallel-opts # Option should match number of threads in" + echo " --parallel-opts # Option should match number of threads in" echo " # --num-threads option above" echo " --min-gaussian-weight # min Gaussian weight allowed in GMM" echo " # initialization (this relatively high" diff --git a/egs/lre07/v1/lid/train_ivector_extractor.sh b/egs/lre07/v1/lid/train_ivector_extractor.sh index a73bd67cbc1..55bd54bb275 100755 --- a/egs/lre07/v1/lid/train_ivector_extractor.sh +++ b/egs/lre07/v1/lid/train_ivector_extractor.sh @@ -146,7 +146,7 @@ while [ $x -lt $num_iters ]; do nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. 
- $cmd -pe smp $nt $dir/log/update.$x.log \ + $cmd --num-threads $nt $dir/log/update.$x.log \ ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; rm $dir/acc.$x.* $cleanup && rm $dir/acc.$x $dir/$x.ie diff --git a/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh b/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh index 9f8fc60292b..573258e7b88 100755 --- a/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh +++ b/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh @@ -164,7 +164,7 @@ while [ $x -lt $num_iters ]; do nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. - $cmd -pe smp $nt $dir/log/update.$x.log \ + $cmd --num-threads $nt $dir/log/update.$x.log \ ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; rm $dir/acc.$x.* $cleanup && rm $dir/acc.$x $dir/$x.ie diff --git a/egs/lre07/v1/run.sh b/egs/lre07/v1/run.sh index a4ff4d909ba..8664494e558 100755 --- a/egs/lre07/v1/run.sh +++ b/egs/lre07/v1/run.sh @@ -127,12 +127,12 @@ utils/subset_data_dir.sh data/train 5000 data/train_5k utils/subset_data_dir.sh data/train 10000 data/train_10k -lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=20G,ram_free=20G" \ +lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd --mem 20G" \ data/train_5k 2048 exp/diag_ubm_2048 -lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=20G,ram_free=20G" \ +lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd --mem 20G" \ data/train_10k exp/diag_ubm_2048 exp/full_ubm_2048_10k -lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ +lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd --mem 35G" \ data/train exp/full_ubm_2048_10k exp/full_ubm_2048 # Alternatively, a diagonal UBM can replace the full UBM used above. @@ -148,7 +148,7 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ #gmm-global-to-fgmm exp/diag_ubm_2048/final.dubm \ # exp/full_ubm_2048/final.ubm -lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ +lid/train_ivector_extractor.sh --cmd "$train_cmd --mem 35G" \ --use-weights true \ --num-iters 5 exp/full_ubm_2048/final.ubm data/train \ exp/extractor_2048 @@ -162,10 +162,10 @@ utils/fix_data_dir.sh data/train_lr echo "**Language count for logistic regression training (after splitting long utterances):**" awk '{print $2}' data/train_lr/utt2lang | sort | uniq -c | sort -nr -lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048 data/train_lr exp/ivectors_train -lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048 data/lre07 exp/ivectors_lre07 lid/run_logistic_regression.sh --prior-scale 0.70 \ diff --git a/egs/lre07/v2/local/dnn/run_nnet2_multisplice.sh b/egs/lre07/v2/local/dnn/run_nnet2_multisplice.sh index a223e12333f..51fcf401cb2 100755 --- a/egs/lre07/v2/local/dnn/run_nnet2_multisplice.sh +++ b/egs/lre07/v2/local/dnn/run_nnet2_multisplice.sh @@ -19,13 +19,13 @@ set -e # assume use_gpu=true since it would be way too slow otherwise. if ! 
cuda-compiled; then - cat < foo; @@ -78,20 +78,20 @@ sid/train_full_ubm.sh --nj 30 --remove-low-count-gaussians false --num-iters 1 - data/train_female_4k exp/full_ubm_2048 exp/full_ubm_2048_female & wait -# note, the mem_free,ram_free is counted per thread... in this setup each +# note, the --mem is counted per thread... in this setup each # job has 4 processes running each with 4 threads; each job takes about 5G # of memory so we need about 20G, plus add memory for sum-accs to make it 25G. -# but we'll submit using -pe smp 16, and this multiplies the memory requirement +# but we'll submit using --num-threads 16, and this multiplies the memory requirement # by 16, so submitting with 2G as the requirement, to make the total requirement # 32, is reasonable. # Train the iVector extractor for male speakers. -sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \ +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 2G" \ --num-iters 5 exp/full_ubm_2048_male/final.ubm data/train_male \ exp/extractor_2048_male # The same for female speakers. -sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \ +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 2G" \ --num-iters 5 exp/full_ubm_2048_female/final.ubm data/train_female \ exp/extractor_2048_female @@ -105,22 +105,22 @@ sid/gender_id.sh --cmd "$train_cmd" --nj 150 exp/full_ubm_2048{,_male,_female} \ # Gender-id error rate is 2.58% # Extract the iVectors for the Fisher data. -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_male data/train_male exp/ivectors_train_male -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_female data/train_female exp/ivectors_train_female # .. and for the SRE08 training and test data. (We focus on the main # evaluation condition, the only required one in that eval, which is # the short2-short3 eval.) 
-sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_female data/sre08_train_short2_female exp/ivectors_sre08_train_short2_female -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_male data/sre08_train_short2_male exp/ivectors_sre08_train_short2_male -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_female data/sre08_test_short3_female exp/ivectors_sre08_test_short3_female -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_male data/sre08_test_short3_male exp/ivectors_sre08_test_short3_male @@ -131,7 +131,7 @@ cat $trials | awk '{print $1, $2}' | \ ivector-compute-dot-products - \ scp:exp/ivectors_sre08_train_short2_female/spk_ivector.scp \ scp:exp/ivectors_sre08_test_short3_female/spk_ivector.scp \ - foo + foo local/score_sre08.sh $trials foo diff --git a/egs/sre08/v1/run.sh b/egs/sre08/v1/run.sh index 4e31542bf4d..c4afe447e8d 100755 --- a/egs/sre08/v1/run.sh +++ b/egs/sre08/v1/run.sh @@ -110,12 +110,12 @@ sid/train_full_ubm.sh --nj 30 --remove-low-count-gaussians false \ wait # Train the iVector extractor for male speakers. -sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 35G" \ --num-iters 5 exp/full_ubm_2048_male/final.ubm data/train_male \ exp/extractor_2048_male # The same for female speakers. -sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 35G" \ --num-iters 5 exp/full_ubm_2048_female/final.ubm data/train_female \ exp/extractor_2048_female @@ -129,25 +129,25 @@ sid/gender_id.sh --cmd "$train_cmd" --nj 150 exp/full_ubm_2048{,_male,_female} \ # Gender-id error rate is 3.41% # Extract the iVectors for the training data. -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_male data/train_male exp/ivectors_train_male -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_female data/train_female exp/ivectors_train_female # .. and for the SRE08 training and test data. (We focus on the main # evaluation condition, the only required one in that eval, which is # the short2-short3 eval.) 
-sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_female data/sre08_train_short2_female \ exp/ivectors_sre08_train_short2_female -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_male data/sre08_train_short2_male \ exp/ivectors_sre08_train_short2_male -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_female data/sre08_test_short3_female \ exp/ivectors_sre08_test_short3_female -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_male data/sre08_test_short3_male \ exp/ivectors_sre08_test_short3_male diff --git a/egs/sre08/v1/sid/nnet2/get_egs2.sh b/egs/sre08/v1/sid/nnet2/get_egs2.sh index 9f1644178e2..05ea1d1a0cd 100755 --- a/egs/sre08/v1/sid/nnet2/get_egs2.sh +++ b/egs/sre08/v1/sid/nnet2/get_egs2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). # 2015 David Snyder # Apache 2.0. # @@ -54,7 +54,7 @@ transform_dir= # If supplied, overrides alidir as the place to find fMLLR tr postdir= # If supplied, we will use posteriors in it as soft training targets. stage=0 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. random_copy=false online_ivector_dir= # can be used if we are including speaker information as iVectors. @@ -83,7 +83,7 @@ if [ $# != 3 ]; then echo " # very end." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -109,7 +109,7 @@ utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info cp $alidir/tree $dir -# Get list of validation utterances. +# Get list of validation utterances. awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/valid_uttlist || exit 1; @@ -129,7 +129,7 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir -## Set up features. +## Set up features. if [ -z $feat_type ]; then if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi fi @@ -140,7 +140,7 @@ case $feat_type in valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` # caution: the top-level nnet training script should copy these to its own dir now. cp $alidir/{splice_opts,final.mat} $dir || exit 1; @@ -280,13 +280,13 @@ if [ $stage -le 3 ]; then egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark" done echo "$0: Generating training examples on disk" - # The examples will go round-robin to egs_list. + # The examples will go round-robin to egs_list. if [ ! 
-z $postdir ]; then $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \ scp:$postdir/post.JOB.scp ark:- \| \ nnet-copy-egs ark:- $egs_list || exit 1; - else + else $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \ "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ @@ -299,7 +299,7 @@ if [ $stage -le 4 ]; then # shuffle the order, writing to the egs.JOB.ark egs_list= - for n in $(seq $nj); do + for n in $(seq $nj); do egs_list="$egs_list $dir/egs_orig.JOB.$n.ark" done diff --git a/egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh b/egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh index 461a213c8ca..c56e89b5d94 100755 --- a/egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh +++ b/egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh @@ -66,7 +66,7 @@ splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3" # so hidden layer indexing is different from component count -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. These don't randprune=4.0 # speeds up LDA. alpha=4.0 # relates to preconditioning. update_period=4 # relates to online preconditioning: says how often we update the subspace. @@ -78,11 +78,11 @@ precondition_rank_out=80 # relates to online preconditioning mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 -combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage. +combine_parallel_opts="--num-threads 8" # queue options for the "combine" stage. cleanup=true egs_dir= lda_opts= @@ -127,10 +127,10 @@ if [ $# != 4 ]; then echo " --num-threads # Number of parallel threads per job (will affect results" echo " # as well as speed; may interact with batch size; if you increase" echo " # this, you may want to decrease the batch size." - echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" - echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" - echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce --mem" + echo " # versus your defaults, because it gets multiplied by the --num-threads argument." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." 
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" diff --git a/egs/sre08/v1/sid/train_diag_ubm.sh b/egs/sre08/v1/sid/train_diag_ubm.sh index 819a51ba73e..1e79fc10c99 100755 --- a/egs/sre08/v1/sid/train_diag_ubm.sh +++ b/egs/sre08/v1/sid/train_diag_ubm.sh @@ -60,7 +60,7 @@ if [ $# != 3 ]; then echo " # in initialization phase (then split)" echo " --num-threads # number of threads to use in initialization" echo " # phase (must match with parallel-opts option)" - echo " --parallel-opts # Option should match number of threads in" + echo " --parallel-opts # Option should match number of threads in" echo " # --num-threads option above" echo " --min-gaussian-weight # min Gaussian weight allowed in GMM" echo " # initialization (this relatively high" @@ -85,7 +85,7 @@ for f in $data/feats.scp $data/vad.scp; do [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1 done -parallel_opts="-pe smp $num_threads" +parallel_opts="--num-threads $num_threads" delta_opts="--delta-window=$delta_window --delta-order=$delta_order" echo $delta_opts > $dir/delta_opts diff --git a/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh b/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh index c64b83c5a4b..2ce915a0750 100755 --- a/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh +++ b/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh @@ -29,9 +29,9 @@ # Begin configuration section. nj=5 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we - # run is nj * num_processes * num_threads, and the number of - # separate pieces of data is nj * num_processes. + # contains num_processes sub-jobs.. the real number of threads we + # run is nj * num_processes * num_threads, and the number of + # separate pieces of data is nj * num_processes. num_threads=4 num_processes=2 # each job runs this many processes, each with --num-threads threads cmd="run.pl" diff --git a/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh b/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh index c8dc351536b..97b9789af0c 100755 --- a/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh +++ b/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh @@ -22,7 +22,7 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. EOF fi -parallel_opts="-l gpu=1" +parallel_opts="--gpu 1" num_threads=1 minibatch_size=512 dir=exp/nnet2_online/nnet_ms_a @@ -39,7 +39,7 @@ if [ $stage -le 6 ]; then # Because we have a lot of data here and we don't want the training to take # too long, we reduce the number of epochs from the defaults (15 + 5) to (3 + - # 1). The option "--io-opts '-tc 12'" is to have more than the default number + # 1). The option "--io-opts '--max-jobs-run 12'" is to have more than the default number # (5) of jobs dumping the egs to disk; this is OK since we're splitting our # data across four filesystems for speed. diff --git a/egs/sre10/v2/cmd.sh b/egs/sre10/v2/cmd.sh index 5c38b3a5d77..fe4cd0bcb3f 100755 --- a/egs/sre10/v2/cmd.sh +++ b/egs/sre10/v2/cmd.sh @@ -6,10 +6,10 @@ # the number of cpus on your machine. #a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." 
-export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +#export cuda_cmd="queue --gpu 1" +export mkgraph_cmd="queue.pl --mem 4G" #b) BUT cluster options #export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" diff --git a/egs/swahili/s5/cmd.sh b/egs/swahili/s5/cmd.sh index ab1c23f76ef..8c9422b92bc 100755 --- a/egs/swahili/s5/cmd.sh +++ b/egs/swahili/s5/cmd.sh @@ -1,5 +1,5 @@ # JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export cuda_cmd="queue --gpu 1" +export mkgraph_cmd="queue.pl --mem 4G" diff --git a/egs/swbd/s5b/local/nnet2/run_5a_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5a_gpu.sh index 940c99538cb..3aae7918964 100755 --- a/egs/swbd/s5b/local/nnet2/run_5a_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5a_gpu.sh @@ -18,7 +18,7 @@ EOF . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ ! -f exp/nnet5a_gpu/final.mdl ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_5b_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5b_gpu.sh index 50f79208897..74058d9fac4 100755 --- a/egs/swbd/s5b/local/nnet2/run_5b_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5b_gpu.sh @@ -66,7 +66,7 @@ if [ $stage -le 2 ]; then steps/nnet2/train_block.sh --stage "$train_stage" \ --num-threads 1 --max-change 40.0 --minibatch-size 512 \ - --parallel-opts "-l gpu=1" \ + --parallel-opts "--gpu 1" \ --initial-learning-rate 0.01 --final-learning-rate 0.001 \ --num-epochs 10 --num-epochs-extra 5 \ --cmd "$decode_cmd" \ diff --git a/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh index 36f72b77083..55becfbe0fc 100755 --- a/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh @@ -20,7 +20,7 @@ EOF ( if [ ! -f exp/nnet5c_gpu/final.mdl ]; then - steps/nnet2/train_tanh.sh --cmd "$decode_cmd" --parallel-opts "-l gpu=1" --io-opts "--max-jobs-run 5" \ + steps/nnet2/train_tanh.sh --cmd "$decode_cmd" --parallel-opts "--gpu 1" --io-opts "--max-jobs-run 5" \ --num-threads 1 --minibatch-size 512 --max-change 40.0 --mix-up 20000 --samples-per-iter 300000 \ --num-epochs 10 --num-epochs-extra 3 --initial-learning-rate 0.0067 --final-learning-rate 0.00067 \ --num-jobs-nnet 10 --num-hidden-layers 5 --hidden-layer-dim 1536 data/train_nodup data/lang \ diff --git a/egs/swbd/s5b/local/nnet2/run_5d_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5d_gpu.sh index 5364f14bcb6..e0b523910df 100755 --- a/egs/swbd/s5b/local/nnet2/run_5d_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5d_gpu.sh @@ -18,7 +18,7 @@ EOF . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ ! -f exp/$dir/final.mdl ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_5e_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5e_gpu.sh index 545c80c0e1c..77de59b90ff 100755 --- a/egs/swbd/s5b/local/nnet2/run_5e_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5e_gpu.sh @@ -18,7 +18,7 @@ train_stage=-10 . 
utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ ! -f exp/$dir/final.mdl ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_5f_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5f_gpu.sh index 3cc315a9775..b91599a27e6 100755 --- a/egs/swbd/s5b/local/nnet2/run_5f_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5f_gpu.sh @@ -9,7 +9,7 @@ dir=nnet5f_gpu . ./cmd.sh . ./path.sh . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ ! -f exp/$dir/final.mdl ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_6a_gpu.sh b/egs/swbd/s5b/local/nnet2/run_6a_gpu.sh index 712c8e79c5b..6327ee85224 100755 --- a/egs/swbd/s5b/local/nnet2/run_6a_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_6a_gpu.sh @@ -21,7 +21,7 @@ EOF . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. alidir=exp/nnet5a_ali_100k_nodup if [ ! -f $alidir/.done ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_6c_gpu.sh b/egs/swbd/s5b/local/nnet2/run_6c_gpu.sh index 8324051279b..0296f4cca00 100755 --- a/egs/swbd/s5b/local/nnet2/run_6c_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_6c_gpu.sh @@ -7,7 +7,7 @@ # directory name. -gpu_opts="-l gpu=1" # This is suitable for the CLSP network, +gpu_opts="--gpu 1" # This is suitable for the CLSP network, # you'll likely have to change it. we'll # use it later on, in the training (it's # not used in denlat creation) @@ -18,8 +18,8 @@ set -e # exit on error. . ./cmd.sh . ./path.sh -! cuda-compiled && cat <${dir}_online/sample_decode.sh . cmd.sh data_dir=\$1 # e.g. data/dev_hires (to be prepared by the user, see egs/tedlium/run.sh for examples) -model_dir=\$2 # e.g. exp/nnet2_online/nnet_ms_sp_online (provided in the distribution) +model_dir=\$2 # e.g. exp/nnet2_online/nnet_ms_sp_online (provided in the distribution) decode_dir=\$model_dir/\`basename \$data_dir\` num_jobs=\`cat \$data_dir/spk2utt | wc -l\` diff --git a/egs/wsj/s5/local/nnet2/run_5b_gpu.sh b/egs/wsj/s5/local/nnet2/run_5b_gpu.sh index 2dc5afa0e87..c1faf3e5d4f 100755 --- a/egs/wsj/s5/local/nnet2/run_5b_gpu.sh +++ b/egs/wsj/s5/local/nnet2/run_5b_gpu.sh @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then steps/nnet2/train_block.sh --stage "$train_stage" \ --num-threads 1 --max-change 40.0 --minibatch-size 512 --num-jobs-nnet 8 \ - --parallel-opts "-l gpu=1" \ + --parallel-opts "--gpu 1" \ --initial-learning-rate 0.0075 --final-learning-rate 0.00075 \ --num-epochs 10 --num-epochs-extra 5 \ --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/nnet2/run_5c.sh b/egs/wsj/s5/local/nnet2/run_5c.sh index e33546572ad..e8df3e8a2e9 100755 --- a/egs/wsj/s5/local/nnet2/run_5c.sh +++ b/egs/wsj/s5/local/nnet2/run_5c.sh @@ -1,7 +1,7 @@ #!/bin/bash # This is neural net training on top of adapted 40-dimensional features. -# +# train_stage=-10 use_gpu=true @@ -13,19 +13,19 @@ use_gpu=true if $use_gpu; then if ! cuda-compiled; then - cat </dev/null || true for epoch in 1 2 3 4; do - # do the actual online decoding with iVectors, carrying info forward from + # do the actual online decoding with iVectors, carrying info forward from # previous utterances of the same speaker. 
- # We just do the bd_tgpr decodes; otherwise the number of combinations + # We just do the bd_tgpr decodes; otherwise the number of combinations # starts to get very large. for lm_suffix in bd_tgpr; do graph_dir=exp/tri4b/graph_${lm_suffix} diff --git a/egs/wsj/s5/local/online/run_nnet2_perturb_speed.sh b/egs/wsj/s5/local/online/run_nnet2_perturb_speed.sh index 1a69e50f3ea..5dd14a435bb 100755 --- a/egs/wsj/s5/local/online/run_nnet2_perturb_speed.sh +++ b/egs/wsj/s5/local/online/run_nnet2_perturb_speed.sh @@ -21,7 +21,7 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. Otherwise, call this script with --use-gpu false EOF fi - parallel_opts="-l gpu=1" + parallel_opts="--gpu 1" num_threads=1 minibatch_size=512 # the _a is in case I want to change the parameters. diff --git a/egs/wsj/s5/local/run_bnf_sgmm.sh b/egs/wsj/s5/local/run_bnf_sgmm.sh index 6cfe1df67ed..8e2aadf214b 100644 --- a/egs/wsj/s5/local/run_bnf_sgmm.sh +++ b/egs/wsj/s5/local/run_bnf_sgmm.sh @@ -4,11 +4,11 @@ . ./cmd.sh -set -e +set -e set -o pipefail set -u -# Set my_nj; typically 64. +# Set my_nj; typically 64. numLeaves=2500 numGauss=15000 numLeavesSGMM=10000 @@ -16,7 +16,7 @@ bnf_num_gauss_ubm=600 bnf_num_gauss_sgmm=7000 align_dir=exp/tri4b_ali_si284 bnf_decode_acwt=0.0357 -sgmm_group_extra_opts=(--group 3 --cmd "queue.pl -l arch=*64 --mem 7G") +sgmm_group_extra_opts=(--group 3 --cmd "queue.pl --mem 7G") if [ ! -d exp_bnf ]; then echo "$0: before running this script, please run local/run_bnf.sh" @@ -64,7 +64,7 @@ echo "Starting exp_bnf/ubm7 on" `date` echo --------------------------------------------------------------------- if [ ! exp_bnf/ubm7/.done -nt exp_bnf/tri6/.done ]; then steps/train_ubm.sh \ - $bnf_num_gauss_ubm data_bnf/train data/lang exp_bnf/tri6 exp_bnf/ubm7 + $bnf_num_gauss_ubm data_bnf/train data/lang exp_bnf/tri6 exp_bnf/ubm7 touch exp_bnf/ubm7/.done fi @@ -75,11 +75,11 @@ if [ ! exp_bnf/sgmm7/.done -nt exp_bnf/ubm7/.done ]; then steps/train_sgmm2_group.sh \ "${sgmm_group_extra_opts[@]}"\ $numLeavesSGMM $bnf_num_gauss_sgmm data_bnf/train data/lang \ - exp_bnf/tri6 exp_bnf/ubm7/final.ubm exp_bnf/sgmm7 + exp_bnf/tri6 exp_bnf/ubm7/final.ubm exp_bnf/sgmm7 touch exp_bnf/sgmm7/.done fi -## SGMM2 decoding +## SGMM2 decoding decode1=exp_bnf/sgmm7/decode_bd_tgpr_eval92 decode2=exp_bnf/sgmm7/decode_bd_tgpr_dev93 echo --------------------------------------------------------------------- @@ -104,7 +104,7 @@ if [ ! exp_bnf/sgmm7_ali/.done -nt exp_bnf/sgmm7/.done ]; then echo --------------------------------------------------------------------- steps/align_sgmm2.sh \ --transform-dir exp_bnf/tri6 --nj 30 --use-graphs true \ - data_bnf/train data/lang exp_bnf/sgmm7 exp_bnf/sgmm7_ali + data_bnf/train data/lang exp_bnf/sgmm7 exp_bnf/sgmm7_ali touch exp_bnf/sgmm7_ali/.done fi @@ -115,7 +115,7 @@ if [ ! exp_bnf/sgmm7_denlats/.done -nt exp_bnf/sgmm7/.done ]; then steps/make_denlats_sgmm2.sh \ "${sgmm_denlats_extra_opts[@]}" \ --transform-dir exp_bnf/tri6 --nj 30 --beam 14.0 --acwt $bnf_decode_acwt --lattice-beam 8 \ - data_bnf/train data/lang exp_bnf/sgmm7_ali exp_bnf/sgmm7_denlats + data_bnf/train data/lang exp_bnf/sgmm7_ali exp_bnf/sgmm7_denlats touch exp_bnf/sgmm7_denlats/.done fi @@ -124,7 +124,7 @@ if [ ! 
exp_bnf/sgmm7_mmi_b0.1/.done -nt exp_bnf/sgmm7_denlats/.done ]; then --acwt $bnf_decode_acwt \ --transform-dir exp_bnf/tri6 --boost 0.1 --drop-frames true \ data_bnf/train data/lang exp_bnf/sgmm7_ali exp_bnf/sgmm7_denlats \ - exp_bnf/sgmm7_mmi_b0.1 + exp_bnf/sgmm7_mmi_b0.1 touch exp_bnf/sgmm7_mmi_b0.1/.done; fi @@ -140,7 +140,7 @@ done for iter in 1 2 3 4; do # Decode SGMM+MMI (via rescoring). - decode2=exp_bnf/sgmm7_mmi_b0.1/decode_bd_tgpr_dev93_it$iter + decode2=exp_bnf/sgmm7_mmi_b0.1/decode_bd_tgpr_dev93_it$iter mkdir -p $decode2 steps/decode_sgmm2_rescore.sh --skip-scoring false --cmd "$decode_cmd" \ --iter $iter --transform-dir exp_bnf/tri6/decode_bd_tgpr_dev93 --scoring-opts "--min-lmwt 20 --max-lmwt 40" \ diff --git a/egs/wsj/s5/steps/nnet2/get_egs2.sh b/egs/wsj/s5/steps/nnet2/get_egs2.sh index 69e92ef4b6f..1cd344ca686 100755 --- a/egs/wsj/s5/steps/nnet2/get_egs2.sh +++ b/egs/wsj/s5/steps/nnet2/get_egs2.sh @@ -50,7 +50,7 @@ transform_dir= # If supplied, overrides alidir as the place to find fMLLR tr postdir= # If supplied, we will use posteriors in it as soft training targets. stage=0 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. random_copy=false online_ivector_dir= # can be used if we are including speaker information as iVectors. cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, diff --git a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh index 5fee71e80cd..03a64e222a8 100755 --- a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh +++ b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh @@ -43,7 +43,7 @@ if [ $# != 6 ]; then echo "" echo "Main options (for others, see top of script file)" echo " --config # config file containing options" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs (probably would be good to add -tc 5 or so if using" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs (probably would be good to add --max-jobs-run 5 or so if using" echo " # GridEngine (to avoid excessive NFS traffic)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" echo " # process." diff --git a/egs/wsj/s5/steps/nnet2/retrain_fast.sh b/egs/wsj/s5/steps/nnet2/retrain_fast.sh index 68ecdf33946..8c82c361d82 100755 --- a/egs/wsj/s5/steps/nnet2/retrain_fast.sh +++ b/egs/wsj/s5/steps/nnet2/retrain_fast.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2014 Johns Hopkins University (Author: Daniel Povey). # Apache 2.0. # retrain_fast.sh is a neural net training script that's intended to train @@ -24,7 +24,7 @@ final_learning_rate=0.004 minibatch_size=128 # by default use a smallish minibatch size for neural net # training; this controls instability which would otherwise - # be a problem with multi-threaded update. + # be a problem with multi-threaded update. samples_per_iter=200000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh num_jobs_nnet=16 # Number of neural net jobs to run in parallel. This option @@ -42,7 +42,7 @@ shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of stage=-5 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. 
These don't +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. These don't alpha=4.0 # relates to preconditioning. update_period=4 # relates to online preconditioning: says how often we update the subspace. @@ -102,7 +102,7 @@ if [ $# != 4 ]; then echo " # this, you may want to decrease the batch size." echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads... " - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -115,7 +115,7 @@ if [ $# != 4 ]; then echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -169,7 +169,7 @@ if [ $stage -le -2 ]; then echo "$0: initializing neural net"; feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1; - + online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample" cat >$dir/nnet.config <) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; $cmd $dir/log/select.$x.log \ diff --git a/egs/wsj/s5/steps/nnet2/retrain_simple2.sh b/egs/wsj/s5/steps/nnet2/retrain_simple2.sh index d0a51110ac8..73cfb3d2d49 100755 --- a/egs/wsj/s5/steps/nnet2/retrain_simple2.sh +++ b/egs/wsj/s5/steps/nnet2/retrain_simple2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -22,7 +22,7 @@ initial_learning_rate=0.04 final_learning_rate=0.004 minibatch_size=128 # by default use a smallish minibatch size for neural net # training; this controls instability which would otherwise - # be a problem with multi-threaded update. + # be a problem with multi-threaded update. samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh num_jobs_nnet=4 # Number of neural net jobs to run in parallel. This option @@ -59,7 +59,7 @@ max_change_per_sample=0.075 mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -parallel_opts="--num-threads 16 --mem 1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 @@ -67,7 +67,7 @@ combine_parallel_opts="--num-threads 8" # queue options for the "combine" stage cleanup=true egs_dir= egs_opts= -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. 
align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] realign_epochs= # List of epochs, the beginning of which realignment is done @@ -103,7 +103,7 @@ if [ $# != 4 ]; then echo " # this, you may want to decrease the batch size." echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads... " - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -116,7 +116,7 @@ if [ $# != 4 ]; then echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -258,7 +258,7 @@ done cur_egs_dir=$egs_dir while [ $x -lt $num_iters ]; do - + if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir cur_egs_dir=$dir/egs_${realign_this_iter[$x]} @@ -268,7 +268,7 @@ while [ $x -lt $num_iters ]; do if [ ! -z "${realign_this_iter[$x]}" ]; then epoch=${realign_this_iter[$x]} - + echo "Getting average posterior for purposes of adjusting the priors." # Note: this just uses CPUs, using a smallish subset of data. @@ -304,7 +304,7 @@ while [ $x -lt $num_iters ]; do steps/nnet2/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ @@ -341,7 +341,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. - + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -383,7 +383,7 @@ while [ $x -lt $num_iters ]; do n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; $cmd $dir/log/select.$x.log \ @@ -420,7 +420,7 @@ if [ $stage -le $num_iters ]; then cur_offset=0 # current offset from first_model_combine. 
for n in $(seq $max_models_combine); do next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" + sub_list="" for o in $(seq $cur_offset $[$next_offset-1]); do iter=$[$first_model_combine+$o] mdl=$dir/$iter.mdl diff --git a/egs/wsj/s5/steps/nnet2/train_block.sh b/egs/wsj/s5/steps/nnet2/train_block.sh index ec8ed7a3856..1e79bb76473 100755 --- a/egs/wsj/s5/steps/nnet2/train_block.sh +++ b/egs/wsj/s5/steps/nnet2/train_block.sh @@ -16,7 +16,7 @@ num_iters_final=20 # Maximum number of final iterations to give to the initial_learning_rate=0.04 final_learning_rate=0.004 bias_stddev=0.0 -shrink_interval=5 # shrink every $shrink_interval iters except while we are +shrink_interval=5 # shrink every $shrink_interval iters except while we are # still adding layers, when we do it every iter. shrink=true num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if @@ -53,7 +53,7 @@ block_shift=5 stage=-5 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. splice_width=7 # meaning +- 7 frames on each side for second LDA randprune=4.0 # speeds up LDA. alpha=4.0 @@ -106,7 +106,7 @@ if [ $# != 4 ]; then echo " # this, you may want to decrease the batch size." echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads... " - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -123,7 +123,7 @@ if [ $# != 4 ]; then echo " # very end." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -153,7 +153,7 @@ cp $alidir/tree $dir utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; cp $lang/phones.txt $dir || exit 1; -# Get list of validation utterances. +# Get list of validation utterances. awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/valid_uttlist || exit 1; awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ @@ -200,7 +200,7 @@ if [ $stage -le -2 ]; then first_hidden_layer_stddev=`perl -e "print 1.0/sqrt($hidden_block_dim);"` stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"` - + cat >$dir/nnet.config < $dir/foo 2>/dev/null || exit 1 nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'` - na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l` + na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l` # na is number of last updatable AffineComponent layer [one-based, counting only # updatable components.] lr_string="$learning_rate" - for n in `seq 2 $nu`; do + for n in `seq 2 $nu`; do if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$softmax_learning_rate; else lr=$learning_rate; fi lr_string="$lr_string:$lr" done - + $cmd $dir/log/average.$x.log \ nnet-am-average $nnets_list - \| \ nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1; @@ -317,7 +317,7 @@ while [ $x -lt $num_iters ]; do else # On other iters, do nnet-am-fix which is much faster and has roughly # the same effect. 
- nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log + nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log fi if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then @@ -380,7 +380,7 @@ if $cleanup; then fi echo Removing most of the models for x in `seq 0 $num_iters`; do - if [ $[$x%10] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then + if [ $[$x%10] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then # delete all but every 10th model; don't delete the ones which combine to form the final model. rm $dir/$x.mdl fi diff --git a/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh index dfa10957e0f..9f3e9234389 100755 --- a/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh +++ b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -36,10 +36,10 @@ num_epochs=15 # Number of epochs of training; initial_effective_lrate=0.01 final_effective_lrate=0.001 bias_stddev=0.5 -hidden_dim=3000 +hidden_dim=3000 minibatch_size=128 # by default use a smallish minibatch size for neural net # training; this controls instability which would otherwise - # be a problem with multi-threaded update. + # be a problem with multi-threaded update. samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh @@ -91,26 +91,26 @@ patch_step2=1 # patch step of the second convolutional layer mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 -combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage. +combine_parallel_opts="--num-threads 8" # queue options for the "combine" stage. cleanup=true egs_dir= lda_opts= lda_dim= egs_opts= delta_order= -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. transform_dir= # If supplied, overrides alidir postdir= -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type= # Can be used to force "raw" features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -143,10 +143,10 @@ if [ $# != 4 ]; then echo " --num-threads # Number of parallel threads per job (will affect results" echo " # as well as speed; may interact with batch size; if you increase" echo " # this, you may want to decrease the batch size." - echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" - echo " # use multiple threads... 
note, you might have to reduce mem_free,ram_free" - echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce --mem" + echo " # versus your defaults, because it gets multiplied by the --num-threads argument." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -170,7 +170,7 @@ if [ $# != 4 ]; then echo " --num-filters2 # number of filters in the second convolutional layer." echo " --patch-dim2 # dim of convolutional kernel in the second layer." - + exit 1; fi @@ -225,7 +225,7 @@ feat-to-dim scp:$sdata/1/feats.scp - > $dir/feat_dim feat_dim=$(cat $dir/feat_dim) || exit 1; if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then - echo "$0: calling get_egs2.sh" + echo "$0: calling get_egs2.sh" steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" --io-opts "$io_opts" \ --postdir "$postdir" --samples-per-iter $samples_per_iter --stage $get_egs_stage \ --cmd "$cmd" --feat-type "raw" $data $alidir $dir/egs || exit 1; @@ -273,7 +273,7 @@ if [ $stage -le -2 ]; then conv_out_dim1=$[$num_filters1*$num_patch1] # 128 x (36 - 7 + 1) pool_out_dim=$[$num_filters1*$num_pool] conv_out_dim2=$[$num_filters2*$num_patch2] - + online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample" initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);") @@ -286,7 +286,7 @@ NormalizeComponent dim=$pool_out_dim AffineComponentPreconditionedOnline input-dim=$pool_out_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0 SoftmaxComponent dim=$num_leaves EOF - + cat >$dir/replace.1.config < &vec_in, std::vector *vec_out) { /// A hashing function-object for vectors. template struct VectorHasher { // hashing function for vector. - size_t operator()(const std::vector &x) const { + size_t operator()(const std::vector &x) const noexcept { size_t ans = 0; typename std::vector::const_iterator iter = x.begin(), end = x.end(); for (; iter != end; ++iter) { @@ -235,7 +235,7 @@ struct VectorHasher { // hashing function for vector. /// A hashing function-object for pairs of ints template struct PairHasher { // hashing function for pair - size_t operator()(const std::pair &x) const { + size_t operator()(const std::pair &x) const noexcept { // 7853 was chosen at random from a list of primes. return x.first + x.second * 7853; } @@ -248,7 +248,7 @@ struct PairHasher { // hashing function for pair /// A hashing function object for strings. 
struct StringHasher { // hashing function for std::string - size_t operator()(const std::string &str) const { + size_t operator()(const std::string &str) const noexcept { size_t ans = 0, len = str.length(); const char *c = str.c_str(), *end = c + len; for (; c != end; c++) { From 7a7368937aff8010f7801189acc25d20bfb280e9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 29 Mar 2017 18:11:12 -0400 Subject: [PATCH 203/213] [src,doc] Fix several unrelated minor problems. Thanks: gaoxinglong --- src/doc/dnn3_scripts_context.dox | 2 +- src/nnet3/nnet-optimize.cc | 2 +- src/nnet3/nnet-utils.h | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/doc/dnn3_scripts_context.dox b/src/doc/dnn3_scripts_context.dox index 43ee0d40260..884e8c79f51 100644 --- a/src/doc/dnn3_scripts_context.dox +++ b/src/doc/dnn3_scripts_context.dox @@ -49,7 +49,7 @@ namespace nnet3 { compute this output without seeing a range of input frames. For example, it may be impossible to compute the output without seeing the range of 't' values from t = 150 through t = 157. In this case (glossing over details), - we'd say that the network has a \b left-context of 3 and a \b right-context of 4. + we'd say that the network has a \b left-context of 4 and a \b right-context of 3. The actual computation of the context is a bit more complex as it has to take into account special cases like where, say, the behavior for odd and even 't' values is different (c.f. Round() descriptors in diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 33091674bd4..abafedc2f2d 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -863,7 +863,7 @@ void FixGotoOutputReordering(const Nnet &nnet, FixGotoLabel(computation); // make sure the destination label of the goto statement was // correct. int32 goto_command_index = -1; - for (int32 c = computation->commands.size(); c >= 0; c--) + for (int32 c = computation->commands.size() - 1; c >= 0; c--) if (computation->commands[c].command_type == kGotoLabel) goto_command_index = c; KALDI_ASSERT(goto_command_index > 0); diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 766b0ed1798..921f1f1901d 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -81,7 +81,7 @@ std::string PrintVectorPerUpdatableComponent(const Nnet &nnet, /// This function returns true if the nnet has the following properties: /// It has an output called "output" (other outputs are allowed but may be -/// ignored). +/// ignored). /// It has an input called "input", and possibly an extra input called /// "ivector", but no other inputs. /// There are probably some other properties that we really ought to @@ -160,8 +160,8 @@ void ConvertRepeatedToBlockAffine(Nnet *nnet); /// Info() function (we need this in the CTC code). std::string NnetInfo(const Nnet &nnet); -/// This function sets the dropout proportion in all dropout component to -/// dropout_proportion value. 
+/// This function sets the dropout proportion in all dropout components to +/// the value 'dropout_proportion' void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); /// This function finds a list of components that are never used, and outputs From e72c15ce9b7064d587e26532f3067ceeb492ff8f Mon Sep 17 00:00:00 2001 From: Yiming Wang Date: Fri, 31 Mar 2017 12:38:45 -0400 Subject: [PATCH 204/213] [src] (minor) Added missing SetZero() to NaturalGradientAffineComponent::Scale() if scale==0.0 (#1522) --- src/nnet3/nnet-simple-component.cc | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index d352c4ae282..2692eb7271c 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2761,11 +2761,19 @@ void NaturalGradientAffineComponent::ZeroStats() { } void NaturalGradientAffineComponent::Scale(BaseFloat scale) { - update_count_ *= scale; - max_change_scale_stats_ *= scale; - active_scaling_count_ *= scale; - linear_params_.Scale(scale); - bias_params_.Scale(scale); + if (scale == 0.0) { + update_count_ = 0.0; + max_change_scale_stats_ = 0.0; + active_scaling_count_ = 0.0; + linear_params_.SetZero(); + bias_params_.SetZero(); + } else { + update_count_ *= scale; + max_change_scale_stats_ *= scale; + active_scaling_count_ *= scale; + linear_params_.Scale(scale); + bias_params_.Scale(scale); + } } void NaturalGradientAffineComponent::Add(BaseFloat alpha, const Component &other_in) { From c7edaecab1ecb28cb323a92cedfe11389c5b9e52 Mon Sep 17 00:00:00 2001 From: david-ryan-snyder Date: Sat, 1 Apr 2017 21:12:55 -0400 Subject: [PATCH 205/213] [scripts,egs] Adding options for using PCA instead of LDA+MLLT for ivectors used in ASR. Results are reported in the default TDNN recipe in AMI. Updating steps/online/nnet2/{train_diag_ubm.sh,train_ivector_extractor.sh} so that they now backup the contents of their destination directory if it already exists. 
(#1514) --- egs/ami/s5b/RESULTS_ihm | 21 +- egs/ami/s5b/local/chain/run_tdnn.sh | 2 +- egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh | 269 ++++++++++++++++++ egs/ami/s5b/local/nnet3/run_ivector_common.sh | 44 ++- .../steps/online/nnet2/get_pca_transform.sh | 67 +++++ .../s5/steps/online/nnet2/train_diag_ubm.sh | 35 ++- .../online/nnet2/train_ivector_extractor.sh | 29 +- 7 files changed, 425 insertions(+), 42 deletions(-) create mode 100755 egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh create mode 100755 egs/wsj/s5/steps/online/nnet2/get_pca_transform.sh diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 44234fc3fd9..25a60d24cfb 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -40,7 +40,6 @@ %WER 24.0 | 13098 94470 | 79.4 12.1 8.5 3.4 24.0 57.1 | -0.153 | exp/ihm/nnet3_cleaned/tdnn_sp/decode_dev/ascore_12/dev_hires.ctm.filt.sys %WER 25.5 | 12643 89984 | 77.7 14.2 8.2 3.2 25.5 56.4 | -0.139 | exp/ihm/nnet3_cleaned/tdnn_sp/decode_eval/ascore_11/eval_hires.ctm.filt.sys - # local/nnet3/run_tdnn.sh --mic ihm --train-set train --gmm tri3 --nnet3-affix "" # nnet3 xent TDNN without data cleaning [cleaning makes very small and # inconsistent difference on this dat] @@ -55,17 +54,21 @@ %WER 22.4 | 12643 89977 | 80.3 12.5 7.2 2.7 22.4 53.6 | -0.503 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_eval/ascore_10/eval_hires.ctm.filt.sys ############################################ - -# local/chain/run_tdnn.sh --mic ihm --stage 12 & -# cleanup + chain TDNN model -# for d in exp/ihm/chain_cleaned/tdnn_sp_bi/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done -%WER 22.5 | 13098 94490 | 80.6 10.8 8.6 3.1 22.5 55.0 | 0.072 | exp/ihm/chain_cleaned/tdnn_sp_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys -%WER 22.5 | 12643 89978 | 80.3 12.5 7.2 2.7 22.5 53.1 | 0.149 | exp/ihm/chain_cleaned/tdnn_sp_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys - +# cleanup + chain TDNN model. +# local/chain/run_tdnn.sh --mic ihm --stage 4 & +# for d in exp/ihm/chain_cleaned/tdnn1d_sp_bi/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +%WER 21.7 | 13098 94488 | 81.1 10.4 8.4 2.8 21.7 54.4 | 0.096 | exp/ihm/chain_cleaned/tdnn1d_sp_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 22.1 | 12643 89979 | 80.5 12.1 7.4 2.6 22.1 52.8 | 0.185 | exp/ihm/chain_cleaned/tdnn1d_sp_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys + +# cleanup + chain TDNN model. Uses LDA instead of PCA for ivector features. +# local/chain/tuning/run_tdnn_1b.sh --mic ihm --stage 4 & +# for d in exp/ihm/chain_cleaned/tdnn1b_sp_bi/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +%WER 22.0 | 13098 94488 | 80.8 10.2 9.0 2.8 22.0 54.7 | 0.102 | exp/ihm/chain_cleaned/tdnn1b_sp_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 22.2 | 12643 89968 | 80.3 12.1 7.6 2.6 22.2 52.9 | 0.170 | exp/ihm/chain_cleaned/tdnn1b_sp_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys # local/chain/run_tdnn.sh --mic ihm --train-set train --gmm tri3 --nnet3-affix "" --stage 12 # chain TDNN model without cleanup [note: cleanup helps very little on this IHM data.] 
-for d in exp/ihm/chain/tdnn_sp_bi/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +# for d in exp/ihm/chain/tdnn_sp_bi/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done %WER 22.4 | 13098 94476 | 80.4 10.4 9.2 2.8 22.4 54.6 | 0.069 | exp/ihm/chain/tdnn_sp_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys %WER 22.5 | 12643 89974 | 80.0 12.1 7.9 2.6 22.5 52.8 | 0.157 | exp/ihm/chain/tdnn_sp_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh index 61f8f499182..e1adaa9346d 120000 --- a/egs/ami/s5b/local/chain/run_tdnn.sh +++ b/egs/ami/s5b/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1b.sh \ No newline at end of file +tuning/run_tdnn_1d.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..a9f228cb55d --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,269 @@ +#!/bin/bash + +# same as 1b but uses PCA instead of +# LDA features for the ivector extractor. + +# Results on 03/27/2017: +# local/chain/compare_wer_general.sh ihm tdnn1b_sp_bi tdnn1d_sp_bi +# System tdnn1b_sp_bi tdnn1d_sp_bi +# WER on dev 22.0 21.9 +# WER on eval 22.2 22.3 +# Final train prob -0.0813472 -0.0807054 +# Final valid prob -0.132032 -0.133564 +# Final train prob (xent) -1.41543 -1.41951 +# Final valid prob (xent) -1.62316 -1.63021 + +set -e -o pipefail +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +ivector_transform_type=pca +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/nnet3/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/run_ivector_common.sh index bccbb42494c..860009c5ef5 100755 --- a/egs/ami/s5b/local/nnet3/run_ivector_common.sh +++ b/egs/ami/s5b/local/nnet3/run_ivector_common.sh @@ -17,8 +17,8 @@ train_set=train # you might set this to e.g. train_cleaned. gmm=tri3 # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. - num_threads_ubm=32 +ivector_transform_type=lda nnet3_affix=_cleaned # affix for exp/$mic/nnet3 directory to put iVector stuff in, so it # becomes exp/$mic/nnet3_cleaned or whatever. @@ -30,7 +30,7 @@ nnet3_affix=_cleaned # affix for exp/$mic/nnet3 directory to put iVector stu gmmdir=exp/${mic}/${gmm} -for f in data/${mic}/${train_set}/feats.scp ${gmmdir}/final.mdl; do +for f in data/${mic}/${train_set}/feats.scp ; do if [ ! -f $f ]; then echo "$0: expected file $f to exist" exit 1 @@ -110,20 +110,36 @@ if [ $stage -le 4 ]; then echo "$0: warning: number of feats $n1 != $n2, if these are very different it could be bad." fi - echo "$0: training a system on the hires data for its LDA+MLLT transform, in order to produce the diagonal GMM." - if [ -e exp/$mic/nnet3${nnet3_affix}/tri5/final.mdl ]; then - # we don't want to overwrite old stuff, ask the user to delete it. - echo "$0: exp/$mic/nnet3${nnet3_affix}/tri5/final.mdl already exists: " - echo " ... please delete and then rerun, or use a later --stage option." 
- exit 1; - fi - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 7 --mllt-iters "2 4 6" \ - --splice-opts "--left-context=3 --right-context=3" \ - 3000 10000 $temp_data_root/${train_set}_hires data/lang \ - $gmmdir exp/$mic/nnet3${nnet3_affix}/tri5 + case $ivector_transform_type in + lda) + if [ ! -f ${gmmdir}/final.mdl ]; then + echo "$0: expected file ${gmmdir}/final.mdl to exist" + exit 1; + fi + echo "$0: training a system on the hires data for its LDA+MLLT transform, in order to produce the diagonal GMM." + if [ -e exp/$mic/nnet3${nnet3_affix}/tri5/final.mdl ]; then + # we don't want to overwrite old stuff, ask the user to delete it. + echo "$0: exp/$mic/nnet3${nnet3_affix}/tri5/final.mdl already exists: " + echo " ... please delete and then rerun, or use a later --stage option." + exit 1; + fi + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 7 --mllt-iters "2 4 6" \ + --splice-opts "--left-context=3 --right-context=3" \ + 3000 10000 $temp_data_root/${train_set}_hires data/lang \ + $gmmdir exp/$mic/nnet3${nnet3_affix}/tri5 + ;; + pca) + echo "$0: computing a PCA transform from the hires data." + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 10000 --subsample 2 \ + $temp_data_root/${train_set}_hires \ + exp/$mic/nnet3${nnet3_affix}/tri5 + ;; + *) echo "$0: invalid iVector transform type $ivector_transform_type" && exit 1; + esac fi - if [ $stage -le 5 ]; then echo "$0: computing a subset of data to train the diagonal UBM." diff --git a/egs/wsj/s5/steps/online/nnet2/get_pca_transform.sh b/egs/wsj/s5/steps/online/nnet2/get_pca_transform.sh new file mode 100755 index 00000000000..e0b704f8852 --- /dev/null +++ b/egs/wsj/s5/steps/online/nnet2/get_pca_transform.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Copyright 2016 David Snyder +# +# This script computes a PCA transform on top of spliced features processed with +# apply-cmvn-online. +# +# +# Apache 2.0. + +# Begin configuration. +cmd=run.pl +config= +stage=0 +dim=40 # The dim after applying PCA +normalize_variance=true # If the PCA transform normalizes the variance +normalize_mean=true # If the PCA transform centers +splice_opts= +online_cmvn_opts= +max_utts=5000 # maximum number of files to use +subsample=5 # subsample features with this periodicity + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: steps/nnet2/get_pca_transform.sh [options] " + echo " e.g.: steps/train_pca_transform.sh data/train_si84 exp/tri2b" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + exit 1; +fi + +data=$1 +dir=$2 + +for f in $data/feats.scp ; do + [ ! -f "$f" ] && echo "$0: expecting file $f to exist" && exit 1 +done + +mkdir -p $dir/log + +echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options + # so that later stages of system building can know what they were. +echo $online_cmvn_opts > $dir/online_cmvn.conf # keep track of options to CMVN. + +# create global_cmvn.stats +if ! 
matrix-sum --binary=false scp:$data/cmvn.scp - >$dir/global_cmvn.stats 2>/dev/null; then + echo "$0: Error summing cmvn stats" + exit 1 +fi + +feats="ark,s,cs:utils/subset_scp.pl --quiet $max_utts $data/feats.scp | apply-cmvn-online $online_cmvn_opts $dir/global_cmvn.stats scp:- ark:- | splice-feats $splice_opts ark:- ark:- | subsample-feats --n=$subsample ark:- ark:- |" + +if [ $stage -le 0 ]; then + $cmd $dir/log/pca_est.log \ + est-pca --dim=$dim --normalize-variance=$normalize_variance \ + --normalize-mean=$normalize_mean "$feats" $dir/final.mat || exit 1; +fi + +echo "Done estimating PCA transform in $dir" + +exit 0 diff --git a/egs/wsj/s5/steps/online/nnet2/train_diag_ubm.sh b/egs/wsj/s5/steps/online/nnet2/train_diag_ubm.sh index 22250ae9ee3..80a023fed8a 100755 --- a/egs/wsj/s5/steps/online/nnet2/train_diag_ubm.sh +++ b/egs/wsj/s5/steps/online/nnet2/train_diag_ubm.sh @@ -10,15 +10,15 @@ # This script was modified from ../../sre08/v1/sid/train_diag_ubm.sh. It trains # a diagonal UBM on top of features processed with apply-cmvn-online and then -# transformed with an LDA+MLLT matrix (obtained from the source directory). -# This script does not use the trained model from the source directory to -# initialize the diagonal GMM; instead, we initialize the GMM using +# transformed with an LDA+MLLT or PCA matrix (obtained from the source +# directory). This script does not use the trained model from the source +# directory to initialize the diagonal GMM; instead, we initialize the GMM using # gmm-global-init-from-feats, which sets the means to random data points and # then does some iterations of E-M in memory. After the in-memory -# initialization we train for a few iterations in parallel. -# Note that there is a slight mismatch in that the source LDA+MLLT matrix -# (final.mat) will have been estimated using standard CMVN, and we're using -# online CMVN. We don't think this will have much effect. +# initialization we train for a few iterations in parallel. Note that if an +# LDA+MLLT transform matrix is used, there will be a slight mismatch in that the +# source LDA+MLLT matrix (final.mat) will have been estimated using standard +# CMVN, and we're using online CMVN. We don't think this will have much effect. # Begin configuration section. @@ -58,7 +58,7 @@ if [ $# != 4 ]; then echo " --stage # stage to do partial re-run from." echo " --num-gselect # Number of Gaussians per frame to" echo " # limit computation to, for speed" - echo " --subsample # In main E-M phase, use every n" + echo " --subsample # In main E-M phase, use every n" echo " # frames (a speedup)" echo " --num-frames # Maximum num-frames to keep in memory" echo " # for model initialization" @@ -89,6 +89,15 @@ for f in $data/feats.scp "$online_cmvn_config" $srcdir/splice_opts $srcdir/final [ ! -f "$f" ] && echo "$0: expecting file $f to exist" && exit 1 done +if [ -d "$dir" ]; then + bak_dir=$(mktemp -d ${dir}/backup.XXX); + echo "$0: Directory $dir already exists. 
Backing up diagonal UBM in ${bak_dir}"; + for f in $dir/final.mat $dir/final.dubm $dir/online_cmvn.conf $dir/global_cmvn.stats; do + [ -f "$f" ] && mv $f ${bak_dir}/ + done + [ -d "$dir/log" ] && mv $dir/log ${bak_dir}/ +fi + splice_opts=$(cat $srcdir/splice_opts) cp $srcdir/splice_opts $dir/ || exit 1; cp $srcdir/final.mat $dir/ || exit 1; @@ -146,10 +155,16 @@ for x in `seq 0 $[$num_iters-1]`; do $cmd $dir/log/update.$x.log \ gmm-global-est $opt --min-gaussian-weight=$min_gaussian_weight $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \ $dir/$[$x+1].dubm || exit 1; - rm $dir/$x.*.acc $dir/$x.dubm + + if $cleanup; then + rm $dir/$x.*.acc $dir/$x.dubm + fi fi done -rm $dir/gselect.*.gz +if $cleanup; then + rm $dir/gselect.*.gz +fi + mv $dir/$num_iters.dubm $dir/final.dubm || exit 1; exit 0; diff --git a/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh b/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh index 67845b01c8a..5dbda1780f4 100755 --- a/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh +++ b/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh @@ -21,7 +21,7 @@ # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -32,8 +32,8 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 @@ -88,6 +88,17 @@ for f in $srcdir/final.dubm $srcdir/final.mat $srcdir/global_cmvn.stats $srcdir/ [ ! -f $f ] && echo "No such file $f" && exit 1; done + +if [ -d "$dir" ]; then + bak_dir=$(mktemp -d ${dir}/backup.XXX); + echo "$0: Directory $dir already exists. Backing up iVector extractor in ${bak_dir}"; + for f in $dir/final.ie $dir/*.ie $dir/final.mat $dir/final.dubm \ + $dir/online_cmvn.conf $dir/global_cmvn.stats; do + [ -f "$f" ] && mv $f ${bak_dir}/ + done + [ -d "$dir/log" ] && mv $dir/log ${bak_dir}/ +fi + # Set various variables. mkdir -p $dir/log nj_full=$[$nj*$num_processes] @@ -105,7 +116,6 @@ gmm_feats="ark,s,cs:apply-cmvn-online --config=$dir/online_cmvn.conf $dir/global feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $dir/final.mat ark:- ark:- | subsample-feats --n=$subsample ark:- ark:- |" - # Initialize the i-vector extractor using the input GMM, which is converted to # full because that's what the i-vector extractor expects. 
Note: we have to do # --use-weights=false to disable regression of the log weights on the ivector, @@ -115,7 +125,7 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=false \ "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie || exit 1 -fi +fi # Do Gaussian selection and posterior extracion @@ -168,20 +178,23 @@ while [ $x -lt $num_iters ]; do # each accumulation process uses, since we # can be sure the queue will support this many. # - # The parallel-opts was either specified by + # The parallel-opts was either specified by # the user or we computed it correctly in # tge previous stages $cmd --num-threads $[$num_threads*$num_processes] $dir/log/update.$x.log \ ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; rm $dir/acc.$x.* if $cleanup; then - rm $dir/acc.$x - # rm $dir/$x.ie + rm $dir/acc.$x $dir/$x.ie fi fi x=$[$x+1] done +if $cleanup; then + rm $dir/post.*.gz +fi + rm $dir/final.ie 2>/dev/null ln -s $x.ie $dir/final.ie From c1e7b292f1390c0cc9c5ddd9ee3fb4cbf79cca14 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 2 Apr 2017 15:25:20 -0400 Subject: [PATCH 206/213] [build,src,doc] Modify get_version.sh to deal better with whitespace (avoid space in version); minor fixes (#1526) --- src/base/get_version.sh | 10 +-- src/chain/chain-training.cc | 4 +- src/doc/transform.dox | 172 ++++++++++++++++++------------------ 3 files changed, 93 insertions(+), 93 deletions(-) diff --git a/src/base/get_version.sh b/src/base/get_version.sh index 4829391ac44..d6c6c975a4d 100755 --- a/src/base/get_version.sh +++ b/src/base/get_version.sh @@ -54,20 +54,20 @@ elif [ "$(git rev-parse --is-inside-work-tree 2>/dev/null)" != true ]; then echo "$0: Using the version number \"$version\" specified in src/.version." else # Figure out patch number. - version_commit=$(git log -1 --pretty=oneline ../.version | cut -f 1 -d ' ') - patch_number=$(git rev-list ${version_commit}..HEAD | wc -l) + version_commit=$(git log -1 --pretty=oneline ../.version | awk '{print $1}') + patch_number=$(git rev-list ${version_commit}..HEAD | wc -l | awk '{print $1}') version="$version.$patch_number" # Check for uncommitted changes in src/. - uncommitted_changes=$(git diff-index HEAD -- .. | wc -l) + uncommitted_changes=$(git diff-index HEAD -- .. | wc -l | awk '{print $1}') if [ $uncommitted_changes -gt 0 ]; then # Add suffix ~N if there are N files in src/ with uncommitted changes version="$version~$uncommitted_changes" fi # Figure out HEAD commit SHA-1. - head_commit=$(git log -1 --pretty=oneline | cut -f 1 -d ' ') - head_commit_short=$(git log -1 --oneline --abbrev=4 | cut -f 1 -d ' ') + head_commit=$(git log -1 --pretty=oneline | awk '{print $1}') + head_commit_short=$(git log -1 --oneline --abbrev=4 | awk '{print $1}') version="$version-${head_commit_short}" fi diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 1bf0201fbfa..53de69a0e07 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -30,7 +30,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const Supervision &supervision, const CuMatrixBase &nnet_output, BaseFloat *objf, - BaseFloat *l2_term, + BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, CuMatrixBase *xent_output_deriv) { @@ -86,7 +86,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, // for different frames of the sequences. 
As expected, they are // smaller towards the edges of the sequences (due to the penalization // of 'incorrect' pdf-ids. - if (GetVerboseLevel() >= 1) { + if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL) { int32 tot_frames = nnet_output_deriv->NumRows(), frames_per_sequence = supervision.frames_per_sequence, num_sequences = supervision.num_sequences; diff --git a/src/doc/transform.dox b/src/doc/transform.dox index 6d487722124..dfeaf6f66d5 100644 --- a/src/doc/transform.dox +++ b/src/doc/transform.dox @@ -31,7 +31,7 @@ namespace kaldi { relate to the commonalities: - \ref transform_apply - \ref transform_perspk - - \ref transform_utt2spk + - \ref transform_utt2spk - \ref transform_compose - \ref transform_weight @@ -49,8 +49,8 @@ namespace kaldi { We next discuss regression class trees and transforms that use them: - \ref transform_regtree - - + + \section transform_apply Applying global linear or affine feature transforms In the case of feature-space transforms and projections that are global, @@ -59,22 +59,22 @@ namespace kaldi { projection is represented as a matrix by which we will left-multiply a feature vector, so the transformed feature is \f$ A x \f$. An affine transform or projection is represented the same way, but we imagine a 1 has been appended to the - feature vector, so the transformed feature is + feature vector, so the transformed feature is \f$ W \left[ \begin{array}{c} x \\ 1 \end{array} \right] \f$ where \f$ W = \left[ A ; b \right] \f$, with A and b being the linear transform and the constant offset. Note that this convention differs from some of the literature, where the 1 may appear as - the first dimension rather than the last. + the first dimension rather than the last. Global transforms and projections are generally written as a type Matrix to a single file, and speaker or utterance-specific transforms or projections are stored in a table of such matrices (see \ref io_sec_tables) - indexed by speaker-id or utterance-id. + indexed by speaker-id or utterance-id. Transforms may be applied to features using the program transform-feats. Its syntax is \verbatim transform-feats -\endverbatim +\endverbatim where is an rspecifier, is an wspecifier, and may be an rxfilename or an rspecifier (see \ref io_sec_specifiers and \ref io_sec_xfilename). The program will work out whether the transform @@ -83,14 +83,14 @@ namespace kaldi { This program is typically used as part of a pipe. A typical example is: \verbatim - feats="ark:splice-feats scp:data/train.scp ark:- | + feats="ark:splice-feats scp:data/train.scp ark:- | transform-feats $dir/0.mat ark:- ark:-|" some-program some-args "$feats" some-other-args ... \endverbatim Here, the file 0.mat contains a single matrix. An example of applying speaker-specific transforms is: \verbatim - feats="ark:add-deltas scp:data/train.scp ark:- | + feats="ark:add-deltas scp:data/train.scp ark:- | transform-feats --utt2spk=ark:data/train.utt2spk ark:$dir/0.trans ark:- ark:-|" some-program some-args "$feats" some-other-args ... \endverbatim @@ -98,33 +98,33 @@ A per-utterance example would be as above but removing the --utt2spk option. In this example, the archive file 0.trans would contain transforms (e.g. CMLLR transforms) indexed by speaker-id, and the file data/train.utt2spk would have lines of the form "utt-id spk-id" (see next section for more explanation). 
-The program transform-feats does not care how the transformation matrix was +The program transform-feats does not care how the transformation matrix was estimated, it just applies it to the features. After it has been through all the features it prints out the average per-frame log determinant. This can be useful when comparing objective functions (this log determinant would have to be added to the per-frame likelihood printed out by programs like gmm-align, gmm-acc-stats, or gmm-decode-kaldi). If the linear part A of the transformation (i.e. ignoring the offset term) is not square, -then the program will instead print out the per-frame average of +then the program will instead print out the per-frame average of \f$ \frac{1}{2} \mathbf{logdet} (A A^T) \f$. It refers to this as the pseudo-log-determinant. -This is useful in checking convergence of MLLT estimation where the transformation matrix +This is useful in checking convergence of MLLT estimation where the transformation matrix being applied is the MLLT matrix times an LDA matrix. \section transform_perspk Speaker-independent versus per-speaker versus per-utterance adaptation Programs that estimate transforms are generally set up to do a particular kind of adaptation, i.e. speaker-independent versus (speaker- or utterance-specific). For example, LDA -and MLLT/STC transforms are speaker-independent but fMLLR transforms are speaker- or +and MLLT/STC transforms are speaker-independent but fMLLR transforms are speaker- or utterance-specific. Programs that estimate speaker- or utterance-specific transforms will work in per-utterance mode by default, but in per-speaker mode if the --spk2utt -option is supplied (see below). +option is supplied (see below). One program that can accept either speaker-independent or speaker- or utterance-specific transforms is transform-feats. This program detects whether the first argument (the transform) is an rxfilename (see \ref io_sec_xfilename) or an rspecifier (see \ref io_sec_specifiers). If the former, it treats it as a speaker-independent transform (e.g. a file containing a single matrix). -If the latter, there are two choices. If no --utt2spk option is provided, +If the latter, there are two choices. If no --utt2spk option is provided, it treats the transform as a table of matrices indexed by utterance id. If an --utt2spk option is provided (utt2spk is a table of strings indexed by utterance that contains the string-valued speaker id), then the transforms are assumed to be indexed by speaker id, and the table @@ -133,13 +133,13 @@ provided to the --utt2spk option is used to map each utterance to a speaker id. \section transform_utt2spk Utterance-to-speaker and speaker-to-utterance maps At this point we give a general overview of the --utt2spk and --spk2utt options. - These options are accepted by programs that deal with transformations; they are used when + These options are accepted by programs that deal with transformations; they are used when you are doing per-speaker (as opposed to per-utterance) adaptation. Typically programs that process already-created transforms will need the --utt2spk - option and programs that create the transforms will need the --spk2utt option. + option and programs that create the transforms will need the --spk2utt option. 
A typical case is that there will be a file called some-directory/utt2spk that looks like: -\verbatim +\verbatim spk1utt1 spk1 spk1utt2 spk1 spk2utt1 spk2 @@ -148,11 +148,11 @@ spk2utt2 spk2 \endverbatim where these strings are just examples, they stand for generic speaker and utterance identifiers; and there will be a file called some-directory/spk2utt that looks like: -\verbatim +\verbatim spk1 spk1utt1 spk1utt2 spk2 spk2utt1 spk2utt2 ... -\endverbatim +\endverbatim and you will supply options that look like --utt2spk=ark:some-directory/utt2spk or --spk2utt=ark:some-directory/spk2utt. The 'ark:' prefix is necessary because these files are given as rspecifiers by the Table code, and are interpreted as archives @@ -177,7 +177,7 @@ spk2 spk2utt1 spk2utt2 for more discussion of this issue. \section transform_compose Composing transforms - + Another program that accepts generic transforms is the program compose-transforms. The general syntax is "compose-transforms a b c", and it performs the multiplication c = a b (although this involves a little more than matrix multiplication if a is affine). @@ -197,7 +197,7 @@ spk2 spk2utt1 spk2utt2 feats="ark:splice-feats scp:data/train.scp ark:- | transform-feats 0.mat ark:- ark:- | transform-feats ark:1.trans ark:- ark:- |" - ... + ... \endverbatim In general, the transforms a and b that are the inputs to compose-transforms may be either speaker-independent transforms or speaker- or utterance-specific @@ -208,11 +208,11 @@ spk2 spk2utt1 spk2utt2 represent either tables or normal files (i.e. either {r,w}specifiers or {r,w}xfilenames), subject to consistency requirements. - If a is an affine transform, in order to perform the composition correctly, compose-transforms + If a is an affine transform, in order to perform the composition correctly, compose-transforms needs to know whether b is affine or linear (it does not know this because it does not have access to the dimension of the features that are transformed by b). This is controlled by the option --b-is-affine (bool, default false). - If b is affine but you forget to set this option and a is affine, compose-transforms + If b is affine but you forget to set this option and a is affine, compose-transforms will treat b as a linear transform from dimension (the real input feature dimension) plus one, and will output a transform whose input dimension is (the real input feature dimension) plus two. There is no way for "transform-feats" to interpret this when it is to be applied to features, @@ -225,7 +225,7 @@ Eliminating silence frames can be helpful when estimating speaker adaptive transforms such as CMLLR. This even appears to be true when using a multi-class approach with a regression tree (for which, see \ref transform_regtree). The way we implement this is by weighting down the posteriors associated with -silence phones. This takes place as a modification to the \ref hmm_post +silence phones. This takes place as a modification to the \ref hmm_post "state-level posteriors". An extract of a bash shell script that does this is below (this script is discussed in more detail in \ref transform_cmllr_global): \verbatim @@ -249,7 +249,7 @@ class LdaEstimate { void Accumulate(const VectorBase &data, int32 class_id, BaseFloat weight=1.0); }; -\endverbatim +\endverbatim The program acc-lda accumulates LDA statistics using the acoustic states (i.e. pdf-ids) as the classes. It requires the transition model in order to map the alignments (expressed in terms of transition-ids) to pdf-ids. 
However, it is not limited to a particular type of acoustic model. @@ -262,16 +262,16 @@ when using LDA as an initialization for HLDA. \section transform_splice Frame splicing -Frame splicing (e.g. splicing nine consecutive frames together) is typically done +Frame splicing (e.g. splicing nine consecutive frames together) is typically done to the raw MFCC features prior to LDA. The program splice-feats does this. A typical line from a script that uses this is the following: \verbatim feats="ark:splice-feats scp:data/train.scp ark:- | transform-feats $dir/0.mat ark:- ark:-|" \endverbatim -and the "feats" variable would later be used as an rspecifier (c.f. \ref io_sec_specifiers) +and the "feats" variable would later be used as an rspecifier (c.f. \ref io_sec_specifiers) by some program that needs to read features. In this example we don't specify the number of frames to splice -together because we are using the defaults (--left-context=4, --right-context=4, or +together because we are using the defaults (--left-context=4, --right-context=4, or 9 frames in total). \section transform_delta Delta feature computation @@ -279,7 +279,7 @@ together because we are using the defaults (--left-context=4, --right-context=4, Computation of delta features is done by the program add-deltas, which uses the function ComputeDeltas. The delta feature computation has the same default setup as HTK's, i.e. to compute the first delta feature we multiply by the features -by a sliding window of values [ -2, 1, 0, 1, 2 ], and then normalize by +by a sliding window of values [ -2, -1, 0, 1, 2 ], and then normalize by dividing by (2^2 + 1^2 + 0^2 + 1^2 + 2^2 = 10). The second delta feature is computed by applying the same approach to the first delta feature. The number of frames of context on each side is controlled by --delta-window (default: 2) @@ -311,9 +311,9 @@ feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |" case they need to be defined slightly differently for the accepted and rejected dimensions. Suppose the original feature dimension is D and the - reduced feature dimension is K. + reduced feature dimension is K. Let us forget the iteration superscript r, and use subscript j for state and - m for Gaussian mixture. + m for Gaussian mixture. For accepted dimensions (\f$0 \leq i < K\f$), the statistics are: \f[ \mathbf{G}^{(i)} = \sum_{t,j,m} \gamma_{jm}(t) \frac{1}{ \sigma^2_{jm}(i) } (\mu_{jm} - \mathbf{x}(t)) (\mu_{jm} - \mathbf{x}(t))^T @@ -333,13 +333,13 @@ feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |" same, so in the code we only store statistics for K+1 rather than D dimensions. 
Also, it is convenient for the program that accumulates the statistics to only have - access to the K-dimensional model, so during HLDA accumulation we accumulate + access to the K-dimensional model, so during HLDA accumulation we accumulate statistics sufficient to estimate the K-dimensional means \f$\mu_{jm}\f$, and insead of - G we accumulate the following statistics: for accepted dimensions (\f$0 \leq i < K\f$), + G we accumulate the following statistics: for accepted dimensions (\f$0 \leq i < K\f$), \f[ \mathbf{S}^{(i)} = \sum_{t,j,m} \gamma_{jm}(t) \frac{1}{ \sigma^2_{jm}(i) } \mathbf{x}(t) \mathbf{x}(t)^T \f] - and for rejected dimensions \f$K \leq i < D\f$ + and for rejected dimensions \f$K \leq i < D\f$ \f[ \mathbf{S}^{(i)} = \sum_{t,j,m} \gamma_{jm}(t) \mathbf{x}(t) \mathbf{x}(t)^T , \f] @@ -350,13 +350,13 @@ feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |" \f] and for \f$K \leq i < D\f$, \f[ - \mathbf{G}^{(i)} = \mathbf{S}^{(i)} - \beta \mu \mu^T, + \mathbf{G}^{(i)} = \mathbf{S}^{(i)} - \beta \mu \mu^T, \f] where \f$ \beta = \sum_{j,m} \gamma_{jm} \f$ is the total count and \f$\mu = \frac{1}{\beta} \sum_{j,m} \mu_{j,m}\f$ is the global feature mean. After computing the transform from the G statistics using the same computation as MLLT, we output the transform, and we also use the first K rows of the transform to project the means into dimension K and write out the transformed model. - + The computation described here is fairly slow; it is \f$ O(K^3) \f$ on each frame, and K is fairly large (e.g. 117). This is the price we pay for compact statistics; if we stored full mean and variance statistics, the per-frame computation would be \f$O(K^2)\f$. @@ -366,14 +366,14 @@ feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |" the frames. If this option is activated, we need to store two separate versions of the sufficient statistics for the means. One version of the mean statistics, accumulated on the subset, is only used in the HLDA computation, and - corresponds to the quantities \f$\gamma_{jm}\f$ and \f$\mu_{jm}\f$ in the formulas above. + corresponds to the quantities \f$\gamma_{jm}\f$ and \f$\mu_{jm}\f$ in the formulas above. The other version of the mean statistics is accumulated on all the training data - and is used to write out the transformed model. - + and is used to write out the transformed model. + The overall HLDA estimation process is as follows (see rm_recipe_2/scripts/train_tri2j.sh): - First initialize it with LDA (we store both the reduced dimension matrix and the full matrix). - - Start model-building and training process. On certain (non-consecutive) + - Start model-building and training process. On certain (non-consecutive) iterations where we have decided to do the HLDA update, do the following: - Accumulate HLDA statistics (S, plus statistics for the full-dimensional means). The program that accumulates these (gmm-acc-hlda) needs the model, the un-transformed features, @@ -384,14 +384,14 @@ feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |" transformation matrix which it needs to start the optimization and to correctly report auxiliary function changes. It outputs the new transform (both full and reduced dimension), and the model with newly estimated and transformed means. - + \section transform_mllt Global Semi-tied Covariance (STC) / Maximum Likelihood Linear Transform (MLLT) estimation Global STC/MLLT is a square feature-transformation matrix. 
For more details, - see "Semi-tied Covariance Matrices for Hidden Markov Models", by Mark Gales, + see "Semi-tied Covariance Matrices for Hidden Markov Models", by Mark Gales, IEEE Transactions on Speech and Audio Processing, vol. 7, 1999, pages 272-281. Viewing it as a feature-space transform, the objective function is the average - per-frame log-likelihood of the transformed features given the model, plus the + per-frame log-likelihood of the transformed features given the model, plus the log determinant of the transform. The means of the model are also rotated by transform in the update phase. The sufficient statistics are the following, for \f$ 0 \leq i < D \f$ where D is the feature dimension: @@ -399,9 +399,9 @@ feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |" \mathbf{G}^{(i)} = \sum_{t,j,m} \gamma_{jm}(t) \frac{1}{ \sigma^2_{jm}(i) } (\mu_{jm} - \mathbf{x}(t)) (\mu_{jm} - \mathbf{x}(t))^T \f] See the reference, Equations (22) and (23) for the update equations. These are - basically a simplified form of the diagonal row-by-row Constrained MLLR/fMLLR update + basically a simplified form of the diagonal row-by-row Constrained MLLR/fMLLR update equations, where the first-order term of the quadratic equation disappears. Note that - our implementation differs from that reference by using a column of the inverse of the matrix + our implementation differs from that reference by using a column of the inverse of the matrix rather than the cofactor, since multiplying by the determinant does not make a difference to the result and could potentially cause problems with floating-point underflow or overflow. @@ -411,9 +411,9 @@ feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |" - Estimate the LDA transformation matrix (we only need the first rows of this, not the full matrix). Call this matrix \f$\mathbf{M}\f$. - - Start a normal model building process, always using features transformed with \f$\mathbf{M}\f$. + - Start a normal model building process, always using features transformed with \f$\mathbf{M}\f$. At certain selected iterations (where we will update the MLLT matrix), we do the following: - - Accumulate MLLT statistics in the current fully-transformed space + - Accumulate MLLT statistics in the current fully-transformed space (i.e., on top of features transformed with \f$\mathbf{M}\f$). For efficiency we do this using a subset of the training data. - Do the MLLT update; let this produce a square matrix \f$\mathbf{T}\f$. @@ -423,34 +423,34 @@ feats="ark:add-deltas --print-args=false scp:data/train.scp ark:- |" The programs involved in MLLT estimation are gmm-acc-mllt and est-mllt. We also need the programs gmm-transform-means (to transform the Gaussian means using \f$\mathbf{T}\f$), and compose-transforms (to do the multiplication \f$\mathbf{M} \leftarrow \mathbf{T} \mathbf{M} \f$). - + \section transform_cmllr_global Global CMLLR/fMLLR transforms Constrained Maximum Likelihood Linear Regression (CMLLR), also known as feature-space MLLR (fMLLR), is an affine feature transform of the form \f$ \mathbf{x} \rightarrow \mathbf{A} \mathbf{x} + \mathbf{b} \f$, - which we write in the form \f$ \mathbf{x} \rightarrow \mathbf{W} \mathbf{x}^+ \f$, where + which we write in the form \f$ \mathbf{x} \rightarrow \mathbf{W} \mathbf{x}^+ \f$, where \f$\mathbf{x}^+ = \left[\begin{array}{c} \mathbf{x} \\ 1 \end{array} \right]\f$ is the feature with - a 1 appended. Note that this differs from some of the literature where the 1 comes first. + a 1 appended. 
Note that this differs from some of the literature where the 1 comes first. For a review paper that explains CMLLR and the estimation techniques we use, see "Maximum likelihood linear transformations for HMM-based speech recognition" by Mark Gales, - Computer Speech and Language Vol. 12, pages 75-98. + Computer Speech and Language Vol. 12, pages 75-98. The sufficient statistics we store are: \f[ \mathbf{K} = \sum_{t,j,m} \gamma_{j,m}(t) \Sigma_{jm}^{-1} \mu_{jm} \mathbf{x}(t)^+ \f] where \f$\Sigma_{jm}^{-1}\f$ is the inverse covariance matrix, and for \f$0 \leq i < D \f$ where D is the feature dimension, - \f[ \mathbf{G}^{(i)} = \sum_{t,j,m} \gamma_{j,m}(t) \frac{1}{\sigma^2_{j,m}(i)} \mathbf{x}(t)^+ \left.\mathbf{x}(t)^+\right.^T \f] + \f[ \mathbf{G}^{(i)} = \sum_{t,j,m} \gamma_{j,m}(t) \frac{1}{\sigma^2_{j,m}(i)} \mathbf{x}(t)^+ \left.\mathbf{x}(t)^+\right.^T \f] Our estimation scheme is the standard one, see Appendix B of the reference (in particular section B.1, "Direct method over rows"). We differ by using a column of the inverse in place of the cofactor row, i.e. ignoring the factor of the determinant, as it does not affect the result and causes danger of numerical underflow or overflow. - Estimation of global Constrained MLLR (CMLLR) transforms is done by the + Estimation of global Constrained MLLR (CMLLR) transforms is done by the class FmllrDiagGmmAccs, - and by the program gmm-est-fmllr (also see gmm-est-fmllr-gpost). The syntax + and by the program gmm-est-fmllr (also see gmm-est-fmllr-gpost). The syntax of gmm-est-fmllr is: \verbatim gmm-est-fmllr [options] \ @@ -486,27 +486,27 @@ feats="ark:add-deltas --print-args=false scp:data/test.scp ark:- | gmm-decode-faster --beam=30.0 --acoustic-scale=0.08333 \ --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst \ "$feats" ark,t:$dir/test.tra ark,t:$dir/test.ali 2>$dir/decode.log -\endverbatim +\endverbatim \section transform_lvtln Linear VTLN (LVTLN) In recent years, there have been a number of papers that describe implementations of Vocal Tract Length Normalization (VTLN) that - work out a linear feature transform corresponding to each VTLN + work out a linear feature transform corresponding to each VTLN warp factor. See, for example, ``Using VTLN for broadcast news transcription'', by D. Y. Kim, S. Umesh, M. J. F. Gales, T. Hain and P. C. Woodland, ICSLP 2004. - + We implement a method in this general category using the class LinearVtln, and programs such as gmm-init-lvtln, gmm-train-lvtln-special, and gmm-est-lvtln-trans. The LinearVtln object essentially stores a set of linear feature transforms, one for each warp factor. Let these linear feature transform matrices be \f[\mathbf{A}^{(i)}, 0\leq i < N, \f] - where for instance we might have \f$N\f$=31, corresponding to 31 different warp - factors. We will describe below how we obtain these matrices below. + where for instance we might have \f$N\f$=31, corresponding to 31 different warp + factors. We will describe below how we obtain these matrices below. The way the speaker-specific transform is estimated is as follows. First, we require some kind of model and a corresponding alignment. In the - example scripts we do this either with a small monophone model, or with + example scripts we do this either with a small monophone model, or with a full triphone model. From this model and alignment, and using the original, unwarped features, we compute the conventional statistics for estimating CMLLR. 
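  To make the form of these statistics concrete, here is a small sketch (not
  the Kaldi implementation; all names in it are purely illustrative) of how the
  CMLLR quantities \f$\mathbf{K}\f$ and \f$\mathbf{G}^{(i)}\f$ defined above
  could be accumulated in NumPy for a diagonal-covariance model, assuming the
  per-frame posteriors \f$\gamma_{jm}(t)\f$ have already been computed; in
  Kaldi this is handled by the FmllrDiagGmmAccs class mentioned earlier.
\verbatim
# Sketch only: accumulate the CMLLR statistics K and G^(i) for a diagonal GMM.
import numpy as np

def acc_fmllr_stats(feats, post, means, variances):
    """feats: (T, D) features; post: (T, M) per-frame Gaussian posteriors;
    means, variances: (M, D) diagonal Gaussian parameters."""
    T, D = feats.shape
    K = np.zeros((D, D + 1))         # sum_{t,m} gamma_m(t) Sigma_m^-1 mu_m x(t)+^T
    G = np.zeros((D, D + 1, D + 1))  # G[i] = sum_{t,m} gamma_m(t)/sigma^2_m(i) x(t)+ x(t)+^T
    for t in range(T):
        x_plus = np.append(feats[t], 1.0)   # x(t)^+ : the feature with a 1 appended
        outer = np.outer(x_plus, x_plus)
        for m in range(post.shape[1]):
            gamma = post[t, m]
            if gamma == 0.0:
                continue
            K += gamma * np.outer(means[m] / variances[m], x_plus)
            for i in range(D):
                G[i] += gamma / variances[m, i] * outer
    return K, G
\endverbatim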
When computing the LVTLN transform, what we do is take each matrix @@ -514,33 +514,33 @@ gmm-decode-faster --beam=30.0 --acoustic-scale=0.08333 \ maximizes the CMLLR auxiliary function for the transform \f$\mathbf{W} = \left[ \mathbf{A}^{(i)} \, ; \, \mathbf{b} \right]\f$. This value of \f$\mathbf{W}\f$ that gave the best auxiliary function value - (i.e. maximizing over i) becomes the transform for that speaker. Since we + (i.e. maximizing over i) becomes the transform for that speaker. Since we are estimating a mean offset here, we are essentially combining a kind of model-based cepstral mean normalization (or alternatively an offset-only form of CMLLR) with VTLN warping implemented - as a linear transform. This avoids us having to implement mean normalization + as a linear transform. This avoids us having to implement mean normalization as a separate step. We next describe how we estimate the matrices \f$\mathbf{A}^{(i)}\f$. We don't do this in the same way as described in the referenced paper; our method is simpler (and easier to justify). Here we describe our computation for a particular warp factor; in the current scripts we have 31 distinct warp - factors ranging from 0.85, 0.86, ..., 1.15. + factors ranging from 0.85, 0.86, ..., 1.15. We take a subset of feature data (e.g. several tens of utterances), and for this subset we compute both the original and transformed features, where the transformed features are computed using a conventional VLTN computation - (see \ref feat_vtln). - Call the original and transformed features \f$\mathbf{x}(t)\f$ and \f$\mathbf{y}(t)\f$ respectively, + (see \ref feat_vtln). + Call the original and transformed features \f$\mathbf{x}(t)\f$ and \f$\mathbf{y}(t)\f$ respectively, where \f$t\f$ will range over the frames of the selected utterances. We compute the affine transform that maps \f$\mathbf{x}\f$ to \f$\mathbf{y}\f$ in a least-squares - sense, i.e. if \f$\mathbf{y}' = \mathbf{A} \mathbf{x} + \mathbf{b}\f$, + sense, i.e. if \f$\mathbf{y}' = \mathbf{A} \mathbf{x} + \mathbf{b}\f$, we compute \f$\mathbf{A}\f$ and \f$\mathbf{b}\f$ that minimizes the sum-of-squares difference \f$\sum_t (\mathbf{y}'(t) - \mathbf{y}(t) )^T (\mathbf{y}'(t) - \mathbf{y}(t) )\f$. Then we normalize the diagonal variance as follows: we compute the variance of the original features as \f$\mathbf{\Sigma}^{(x)}\f$ and of the linearly transformed features as \f$\mathbf{\Sigma}^{(y')}\f$, and for each dimension index d we multiply the - d'th row of \f$\mathbf{A}\f$ by - \f$\sqrt{ \frac{\mathbf{\Sigma}^{(x)}_{d,d}}{\mathbf{\Sigma}^{(y')}_{d,d}}}\f$. + d'th row of \f$\mathbf{A}\f$ by + \f$\sqrt{ \frac{\mathbf{\Sigma}^{(x)}_{d,d}}{\mathbf{\Sigma}^{(y')}_{d,d}}}\f$. The resulting matrix will become \f$\mathbf{A}^{(i)}\f$ for some value of i. The command-line tools support the option to ignore the log determinant term @@ -579,8 +579,8 @@ gmm-decode-faster --beam=30.0 --acoustic-scale=0.08333 \ are speaker-specific; other quantities (i.e. \f$\mathbf{A}\f$ and \f$\mathbf{B}\f$) are global and shared across all speakers. - The most important factor in this equation is the middle one, - with the exponential function in it. + The most important factor in this equation is the middle one, + with the exponential function in it. The factor \f$\mathbf{D}_s\f$ gives us the ability to combine model-based mean and optionally variance normalization (i.e. 
offset-only or diagonal-only CMLLR) @@ -596,7 +596,7 @@ gmm-decode-faster --beam=30.0 --acoustic-scale=0.08333 \ there would be no point to this technique as the other quantities in the equation would add no degrees of freedom. The tools support three kinds of constraints on \f$\mathbf{D}_s\f$: it may be of the form - \f$[ {\mathbf I} \, \;\, {\mathbf 0} ]\f$ (no adaptation), or + \f$[ {\mathbf I} \, \;\, {\mathbf 0} ]\f$ (no adaptation), or \f$[ {\mathbf I} \, \;\, {\mathbf m} ]\f$ (offset only), or \f$[ {\mathrm{diag}}( {\mathbf d} ) \, \;\, {\mathbf m} ]\f$ (diagonal CMLLR); this is controlled by the --normalize-type options to the command-line tools. @@ -613,9 +613,9 @@ gmm-decode-faster --beam=30.0 --acoustic-scale=0.08333 \ if we were to warp by a factor f and then a factor g, this should be the same as warping by the combined factor fg. Let l = log(f) and m = log(g). Then we achieve this - property via the identity + property via the identity \f[ \exp( l \mathbf{A} ) \exp( m \mathbf{A}) = \exp( (l+m) \mathbf{A} ) . \f] - + The ET computation for a particular speaker is as follows; this assumes we are given \f$\mathbf{A}\f$ and \f$\mathbf{B}\f$. We accumulate conventional CMLLR sufficient statistics for the speaker. In the update phase we iteratively optimize @@ -636,9 +636,9 @@ gmm-decode-faster --beam=30.0 --acoustic-scale=0.08333 \ \f$\mathbf{B}\f$, or the model. - If updating \f$\mathbf{A}\f$, we do this given fixed values of \f$t_s\f$ and \f$\mathbf{D}_s\f$. The update is not guaranteed to - converge, but converges rapidly in practice; it's based on a + converge, but converges rapidly in practice; it's based on a quadratic "weak-sense auxiliary function" - where the quadratic term is obtained using a first-order truncation + where the quadratic term is obtained using a first-order truncation of the Taylor series expansion of the matrix exponential function. After updating \f$\mathbf{A}\f$, we modify \f$\mathbf{B}\f$ in order to renormalize the \f$t_s\f$ to zero; this involves premultiplying @@ -646,11 +646,11 @@ gmm-decode-faster --beam=30.0 --acoustic-scale=0.08333 \ value of \f$t_s\f$. - If updating \f$\mathbf{B}\f$, this is also done using fixed values of - \f$t_s\f$ and \f$\mathbf{D}_s\f$, and the update is similar to MLLT + \f$t_s\f$ and \f$\mathbf{D}_s\f$, and the update is similar to MLLT (a.k.a. global STC). For purposes of the accumulation and update, we imagine we are estimating an MLLT matrix just to the left of \f$\mathbf{A}\f$, i.e. some matrix - \f$\mathbf{C} \in \Re^{D\times D}\f$; let us define + \f$\mathbf{C} \in \Re^{D\times D}\f$; let us define \f$\mathbf{C}^+ = \left[ \begin{array}{cc} \mathbf{C} & 0 \\ 0 & 1 \end{array} \right]\f$. The transform will be \f$\mathbf{W}_s = \mathbf{D}_s \mathbf{C}^+ \exp ( t_s \mathbf{A} ) \mathbf{B}\f$. @@ -660,24 +660,24 @@ gmm-decode-faster --beam=30.0 --acoustic-scale=0.08333 \ \f$\exp ( t_s \mathbf{A} ) \mathbf{B}\f$ as a feature-space transform (i.e. as part of the features). After estimating \f$\mathbf{C}\f$, we will use the identity \f[ - \mathbf{C}^+ \exp ( t_s \mathbf{A} ) = \exp ( t_s \mathbf{C}^+ \mathbf{A} \left.\mathbf{C}^+\right.^{-1} ) \mathbf{C}^+ + \mathbf{C}^+ \exp ( t_s \mathbf{A} ) = \exp ( t_s \mathbf{C}^+ \mathbf{A} \left.\mathbf{C}^+\right.^{-1} ) \mathbf{C}^+ \f] so the update becomes: \f[ \mathbf{A} \leftarrow \mathbf{C}^+ \mathbf{A} \left.\mathbf{C}^+\right.^{-1} , \ \ \mathbf{B} \leftarrow \mathbf{C}^+ \mathbf{B} . 
\f] At this point we need to transform the model means with the matrix - \f$\mathbf{C}\f$. The reader might question how this interacts with the + \f$\mathbf{C}\f$. The reader might question how this interacts with the fact that for estimating \f$\mathbf{C}\f$, we viewed the quantity \f$\mathbf{D}_s\f$ as a model-space transform. If \f$\mathbf{D}_s\f$ only - contains a mean offset, we can still prove that the auxiliary function + contains a mean offset, we can still prove that the auxiliary function would increase, except we would have to change the offsets appropriately (this is not necessary to do explicitly, as we will re-estimate them on - the next iteration anyway). However, if \f$\mathbf{D}_s\f$ has non-unit - diagonal (i.e. is diagonal not offset CMLLR), this re-estimation process - is not guaranteed to improve the likelihood; the tools will print a warning + the next iteration anyway). However, if \f$\mathbf{D}_s\f$ has non-unit + diagonal (i.e. is diagonal not offset CMLLR), this re-estimation process + is not guaranteed to improve the likelihood; the tools will print a warning in this case. In order to avoid encountering this case, our scripts - train in a mode where \f$\mathbf{D}_s\f$ is an offset-only transform; but + train in a mode where \f$\mathbf{D}_s\f$ is an offset-only transform; but in test time we allow \f$\mathbf{D}_s\f$ to be a diagonal CMLLR transform, which seems to give slightly better results than the offset-only case. @@ -704,7 +704,7 @@ expanded features). For very fast operation, it is possible to apply these approaches using a very tiny model with a phone-based language model, and some of our example scripts demonstrate this. There is also the capability in the feature extraction code to subtract the mean on a per-utterance basis (the ---subtract-mean option to compute-mfcc-feats and compute-plp-feats). +--subtract-mean option to compute-mfcc-feats and compute-plp-feats). In order to support per-utterance and per-speaker mean and variance normalization we provide the programs compute-cmvn-stats and apply-cmvn. The program From 53d0e88f38646889f7793a71f4ab0046c6d39074 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Tue, 4 Apr 2017 19:31:30 +0200 Subject: [PATCH 207/213] [build]: remove openfst check (#1531) It appears there may be no good reason to disallow system-wide OpenFst. --- tools/extras/check_dependencies.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh index 3c26fd53e82..43579334c89 100755 --- a/tools/extras/check_dependencies.sh +++ b/tools/extras/check_dependencies.sh @@ -181,14 +181,6 @@ if which grep >&/dev/null && pwd | grep -E 'JOB|LMWT' >/dev/null; then status=1; fi -if [ -f /usr/lib64/libfst.so.1 ] || [ -f /usr/local/include/fst.h ] || \ - [ -f /usr/include/fst/fst.h ] || [ -f /usr/local/bin/fstinfo ]; then - echo "*** $0: Kaldi cannot be installed (for now) if you have OpenFst" - echo "*** installed in system space (version mismatches, etc.)" - echo "*** Please try to uninstall it." - status=1 -fi - if ! $printed && [ $status -eq 0 ]; then echo "$0: all OK." 
fi From 5ae74f107eeec02365853c4dac25d1e7ef886ab1 Mon Sep 17 00:00:00 2001 From: Shiyin Kang Date: Tue, 4 Apr 2017 13:10:40 -0500 Subject: [PATCH 208/213] [src] cudamatrix: speed up AddColSumMat with transfrom reduce kernel template (#1530) CuVector::AddColSumMat[no-trans], 16 0.0057 0.0172 3.01x CuVector::AddColSumMat[no-trans], 32 0.0242 0.0668 2.76x CuVector::AddColSumMat[no-trans], 64 0.0992 0.2577 2.60x CuVector::AddColSumMat[no-trans], 128 0.3747 0.9280 2.48x CuVector::AddColSumMat[no-trans], 256 1.4711 3.0541 2.08x CuVector::AddColSumMat[no-trans], 512 5.1709 9.4713 1.83x CuVector::AddColSumMat[no-trans], 1024 12.4352 20.4517 1.64x CuVector::AddColSumMat[no-trans], 16 0.0060 0.0175 2.91x CuVector::AddColSumMat[no-trans], 32 0.0240 0.0672 2.80x CuVector::AddColSumMat[no-trans], 64 0.1006 0.2712 2.70x CuVector::AddColSumMat[no-trans], 128 0.3691 0.9097 2.46x CuVector::AddColSumMat[no-trans], 256 1.4530 3.1044 2.14x CuVector::AddColSumMat[no-trans], 512 4.4524 7.5872 1.70x CuVector::AddColSumMat[no-trans], 1024 11.1212 16.1423 1.45x --- src/cudamatrix/cu-kernels-ansi.h | 6 +++++ src/cudamatrix/cu-kernels.cu | 43 +++++++++++++++++++++++++++++++- src/cudamatrix/cu-kernels.h | 10 ++++++++ src/cudamatrix/cu-vector.cc | 24 +++++++++++------- 4 files changed, 73 insertions(+), 10 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index a69246a339a..444da38dd30 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -30,6 +30,12 @@ #if HAVE_CUDA == 1 extern "C" { +void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat, + const MatrixDim d, const double alpha, + const double beta); +void cudaF_add_col_sum_mat(int Gr, int Bl, float* result, const float* mat, + const MatrixDim d, const float alpha, + const float beta); void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index d4b247ffaa7..60800d9568d 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -1220,7 +1220,7 @@ static void _equal_element_mask(const Real *mat1, const Real *mat2, Real *mask, } enum EnumTransformReduce { - SUM, MAX, MIN, LINFNORM, L2NORM, L1NORM, L0NORM, LPNORM + SUMAB, SUM, MAX, MIN, LINFNORM, L2NORM, L1NORM, L0NORM, LPNORM }; template @@ -1243,6 +1243,35 @@ struct TransReduceOp { } }; +template +struct TransReduceOp { + const Real alpha_; + const Real beta_; + TransReduceOp(const Real& a, const Real& b) : + alpha_(a), beta_(b) { + } + __forceinline__ + __device__ Real InitValue() const { + return Real(0); + } + __forceinline__ + __device__ Real Transform(const Real& x) const { + return x; + } + __forceinline__ + __device__ Real Reduce(const Real& a, const Real& b) const { + return a + b; + } + __forceinline__ + __device__ Real PostReduce(const Real& x, const Real& output) const { + if (beta_ == Real(0)) { + return alpha_ * x; + } else { + return alpha_ * x + beta_ * output; + } + } +}; + template struct TransReduceOp { __forceinline__ @@ -3570,6 +3599,12 @@ void cudaF_sum_mat_cols(int Gr, int Bl, float* result, const float* mat, _transform_reduce_mat_cols<<>>(result,mat,d, TransReduceOp()); } +void cudaF_add_col_sum_mat(int Gr, int Bl, float* result, const float* mat, + const MatrixDim d, const float alpha, + const float beta) { + _transform_reduce_mat_cols<<>>(result, mat, d, + TransReduceOp(alpha, beta)); +} void cudaF_replace_value(int 
Gr, int Bl, float *v, int dim, float orig, float changed) { @@ -4225,6 +4260,12 @@ void cudaD_sum_mat_cols(int Gr, int Bl, double* result, const double* mat, _transform_reduce_mat_cols<<>>(result,mat,d, TransReduceOp()); } +void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat, + const MatrixDim d, const double alpha, + const double beta) { + _transform_reduce_mat_cols<<>>(result, mat, d, + TransReduceOp(alpha, beta)); +} void cudaD_replace_value(int Gr, int Bl, double *v, int dim, double orig, double changed) { diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 87aaf096570..77352b5925f 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -38,6 +38,16 @@ namespace kaldi { +inline void cuda_add_col_sum_mat(int Gr, int Bl, double* result, + const double* mat, const MatrixDim d, + const double alpha, const double beta) { + cudaD_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta); +} +inline void cuda_add_col_sum_mat(int Gr, int Bl, float* result, + const float* mat, const MatrixDim d, + const float alpha, const float beta) { + cudaF_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta); +} inline void cuda_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index e6aa72249f7..b825b9c0a6e 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -1173,19 +1173,25 @@ void CuVectorBase::AddRowSumMat(Real alpha, const CuMatrixBase &mat, } - template -void CuVectorBase::AddColSumMat(Real alpha, - const CuMatrixBase &mat, +void CuVectorBase::AddColSumMat(Real alpha, const CuMatrixBase &mat, Real beta) { - KALDI_ASSERT(mat.NumRows() == Dim()); - - CuVector ones(mat.NumCols()); - ones.Set(1.0); - this->AddMatVec(alpha, mat, kNoTrans, ones, beta); -} +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + KALDI_ASSERT(mat.NumRows() == Dim()); + cuda_add_col_sum_mat(mat.NumRows(), CU1DBLOCK, Data(), mat.Data(), + mat.Dim(), alpha, beta); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + Vec().AddColSumMat(alpha, mat.Mat(), beta); + } +} template void CuVectorBase::InvertElements() { From 5ac45be3887eff09df6f11a0f8392ec6de10987d Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 4 Apr 2017 17:52:17 -0400 Subject: [PATCH 209/213] [src] Cosmetic change: remove 'train.tra' from usage messages (#1529) --- src/bin/ali-to-phones.cc | 2 +- src/bin/align-equal.cc | 9 ++++++--- src/bin/compile-train-graphs.cc | 6 ++++-- src/bin/phones-to-prons.cc | 3 ++- src/bin/prons-to-wordali.cc | 4 ++-- src/gmmbin/gmm-align-compiled.cc | 4 ++-- src/gmmbin/gmm-align.cc | 6 ++++-- src/latbin/lattice-best-path.cc | 2 +- src/latbin/lattice-mbr-decode.cc | 4 ++-- src/latbin/linear-to-nbest.cc | 3 ++- src/latbin/nbest-to-linear.cc | 2 +- src/nnet2bin/nnet-align-compiled.cc | 7 ++++--- src/nnet3bin/nnet3-align-compiled.cc | 4 ++-- src/onlinebin/online-wav-gmm-decode-faster.cc | 2 +- 14 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/bin/ali-to-phones.cc b/src/bin/ali-to-phones.cc index b370dbc7f18..2a76000cfae 100644 --- a/src/bin/ali-to-phones.cc +++ b/src/bin/ali-to-phones.cc @@ -35,7 +35,7 @@ int main(int argc, char *argv[]) { "Usage: ali-to-phones [options] " "\n" "e.g.: \n" - " ali-to-phones 1.mdl ark:1.ali ark:phones.tra\n" + " ali-to-phones 1.mdl ark:1.ali ark:-\n" 
"or:\n" " ali-to-phones --ctm-output 1.mdl ark:1.ali 1.ctm\n" "See also: show-alignments lattice-align-phones\n"; diff --git a/src/bin/align-equal.cc b/src/bin/align-equal.cc index 3d35ee33daa..a3bc40dc236 100644 --- a/src/bin/align-equal.cc +++ b/src/bin/align-equal.cc @@ -36,10 +36,13 @@ int main(int argc, char *argv[]) { using fst::VectorFst; using fst::StdArc; - const char *usage = "Write equally spaced alignments of utterances (to get training started)\n" - "Usage: align-equal \n" + const char *usage = "Write equally spaced alignments of utterances " + "(to get training started)\n" + "Usage: align-equal " + " \n" "e.g.: \n" - " align-equal 1.tree 1.mdl lex.fst scp:train.scp ark:train.tra ark:equal.ali\n"; + " align-equal 1.tree 1.mdl lex.fst scp:train.scp " + "'ark:sym2int.pl -f 2- words.txt text|' ark:equal.ali\n"; ParseOptions po(usage); std::string disambig_rxfilename; diff --git a/src/bin/compile-train-graphs.cc b/src/bin/compile-train-graphs.cc index 6636ef88878..874d079376e 100644 --- a/src/bin/compile-train-graphs.cc +++ b/src/bin/compile-train-graphs.cc @@ -37,9 +37,11 @@ int main(int argc, char *argv[]) { const char *usage = "Creates training graphs (without transition-probabilities, by default)\n" "\n" - "Usage: compile-train-graphs [options] \n" + "Usage: compile-train-graphs [options] " + " \n" "e.g.: \n" - " compile-train-graphs tree 1.mdl lex.fst ark:train.tra ark:graphs.fsts\n"; + " compile-train-graphs tree 1.mdl lex.fst " + "'ark:sym2int.pl -f 2- words.txt text|' ark:graphs.fsts\n"; ParseOptions po(usage); TrainingGraphCompilerOptions gopts; diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 6e3cf7a4651..0d7ab12c232 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -80,7 +80,8 @@ int main(int argc, char *argv[]) { " \n" "e.g.: \n" " ali-to-phones 1.mdl ark:1.ali ark:- | \\\n" - " phones-to-prons L_align.fst 46 47 ark:- 1.tra ark:1.prons\n"; + " phones-to-prons L_align.fst 46 47 ark:- " + "'ark:sym2int.pl -f 2- words.txt text|' ark:1.prons\n"; ParseOptions po(usage); po.Read(argc, argv); diff --git a/src/bin/prons-to-wordali.cc b/src/bin/prons-to-wordali.cc index 8e89d7cc644..a6331043500 100644 --- a/src/bin/prons-to-wordali.cc +++ b/src/bin/prons-to-wordali.cc @@ -52,8 +52,8 @@ int main(int argc, char *argv[]) { " \n" "e.g.: \n" " ali-to-phones 1.mdl ark:1.ali ark:- | \\\n" - " phones-to-prons L_align.fst 46 47 ark:- 1.tra ark:- | \\\n" - " prons-to-wordali ark:- \\\n" + " phones-to-prons L_align.fst 46 47 ark:- 'ark:sym2int.pl -f 2- words.txt text|' \\\n" + " ark:- | prons-to-wordali ark:- \\\n" " \"ark:ali-to-phones --write-lengths 1.mdl ark:1.ali ark:-|\" ark:1.wali\n"; ParseOptions po(usage); diff --git a/src/gmmbin/gmm-align-compiled.cc b/src/gmmbin/gmm-align-compiled.cc index 85ac3fd27a7..c3aadcc7ec9 100644 --- a/src/gmmbin/gmm-align-compiled.cc +++ b/src/gmmbin/gmm-align-compiled.cc @@ -44,8 +44,8 @@ int main(int argc, char *argv[]) { "e.g.: \n" " gmm-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n" "or:\n" - " compile-train-graphs tree 1.mdl lex.fst ark:train.tra b, ark:- | \\\n" - " gmm-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n"; + " compile-train-graphs tree 1.mdl lex.fst 'ark:sym2int.pl -f 2- words.txt text|' \\\n" + " ark:- | gmm-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n"; ParseOptions po(usage); AlignConfig align_config; diff --git a/src/gmmbin/gmm-align.cc b/src/gmmbin/gmm-align.cc index 7ef5f9c8dab..c9c2fde11f6 100644 --- a/src/gmmbin/gmm-align.cc +++ 
b/src/gmmbin/gmm-align.cc @@ -39,9 +39,11 @@ int main(int argc, char *argv[]) { const char *usage = "Align features given [GMM-based] models.\n" - "Usage: gmm-align [options] tree-in model-in lexicon-fst-in feature-rspecifier transcriptions-rspecifier alignments-wspecifier\n" + "Usage: gmm-align [options] tree-in model-in lexicon-fst-in feature-rspecifier " + "transcriptions-rspecifier alignments-wspecifier\n" "e.g.: \n" - " gmm-align tree 1.mdl lex.fst scp:train.scp ark:train.tra ark:1.ali\n"; + " gmm-align tree 1.mdl lex.fst scp:train.scp " + "'ark:sym2int.pl -f 2- words.txt text|' ark:1.ali\n"; ParseOptions po(usage); AlignConfig align_config; BaseFloat acoustic_scale = 1.0; diff --git a/src/latbin/lattice-best-path.cc b/src/latbin/lattice-best-path.cc index dc25fb351c6..ce9016d750c 100644 --- a/src/latbin/lattice-best-path.cc +++ b/src/latbin/lattice-best-path.cc @@ -38,7 +38,7 @@ int main(int argc, char *argv[]) { "Note: if you want output as FSTs, use lattice-1best; if you want output\n" "with acoustic and LM scores, use lattice-1best | nbest-to-linear\n" "Usage: lattice-best-path [options] [ [ ] ]\n" - " e.g.: lattice-best-path --acoustic-scale=0.1 ark:1.lats ark:1.tra ark:1.ali\n"; + " e.g.: lattice-best-path --acoustic-scale=0.1 ark:1.lats 'ark,t:|int2sym.pl -f 2- words.txt > text' ark:1.ali\n"; ParseOptions po(usage); BaseFloat acoustic_scale = 1.0; diff --git a/src/latbin/lattice-mbr-decode.cc b/src/latbin/lattice-mbr-decode.cc index 465f4e35fbd..fba5daa4dd8 100644 --- a/src/latbin/lattice-mbr-decode.cc +++ b/src/latbin/lattice-mbr-decode.cc @@ -43,8 +43,8 @@ int main(int argc, char *argv[]) { "Usage: lattice-mbr-decode [options] lattice-rspecifier " "transcriptions-wspecifier [ bayes-risk-wspecifier " "[ sausage-stats-wspecifier [ times-wspecifier] ] ] \n" - " e.g.: lattice-mbr-decode --acoustic-scale=0.1 ark:1.lats ark:1.tra " - "ark:/dev/null ark:1.sau\n"; + " e.g.: lattice-mbr-decode --acoustic-scale=0.1 ark:1.lats " + "'ark,t:|int2sym.pl -f 2- words.txt > text' ark:/dev/null ark:1.sau\n"; ParseOptions po(usage); BaseFloat acoustic_scale = 1.0; diff --git a/src/latbin/linear-to-nbest.cc b/src/latbin/linear-to-nbest.cc index fd025f382b6..a1864d0d14a 100644 --- a/src/latbin/linear-to-nbest.cc +++ b/src/latbin/linear-to-nbest.cc @@ -67,7 +67,8 @@ int main(int argc, char *argv[]) { "\n" "Note: if the rspecifiers for lm-cost or ac-cost are the empty string,\n" "these value will default to zero.\n" - " e.g.: linear-to-nbest ark:1.ali ark:1.tra ark:1.lmscore ark:1.acscore " + " e.g.: linear-to-nbest ark:1.ali 'ark:sym2int.pl -f 2- words.txt text|' " + "ark:1.lmscore ark:1.acscore " "ark:1.nbest\n"; ParseOptions po(usage); diff --git a/src/latbin/nbest-to-linear.cc b/src/latbin/nbest-to-linear.cc index 6b3fe5e1d01..d63c380133a 100644 --- a/src/latbin/nbest-to-linear.cc +++ b/src/latbin/nbest-to-linear.cc @@ -39,7 +39,7 @@ int main(int argc, char *argv[]) { "Usage: nbest-to-linear [options] " "[ [ []]]\n" " e.g.: lattice-to-nbest --n=10 ark:1.lats ark:- | \\\n" - " nbest-to-linear ark:1.lats ark,t:1.ali ark,t:1.tra\n"; + " nbest-to-linear ark:1.lats ark,t:1.ali 'ark,t:|int2sym.pl -f 2- words.txt > text'\n"; ParseOptions po(usage); diff --git a/src/nnet2bin/nnet-align-compiled.cc b/src/nnet2bin/nnet-align-compiled.cc index 60045eb7cce..8f5537c26c7 100644 --- a/src/nnet2bin/nnet-align-compiled.cc +++ b/src/nnet2bin/nnet-align-compiled.cc @@ -40,12 +40,13 @@ int main(int argc, char *argv[]) { const char *usage = "Align features given neural-net-based model\n" - "Usage: 
nnet-align-compiled [options] model-in graphs-rspecifier feature-rspecifier alignments-wspecifier\n" + "Usage: nnet-align-compiled [options] model-in graphs-rspecifier " + "feature-rspecifier alignments-wspecifier\n" "e.g.: \n" " nnet-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n" "or:\n" - " compile-train-graphs tree 1.mdl lex.fst ark:train.tra b, ark:- | \\\n" - " nnet-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n"; + " compile-train-graphs tree 1.mdl lex.fst 'ark:sym2int.pl -f 2- words.txt text|' \\\n" + " ark:- | nnet-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n"; ParseOptions po(usage); AlignConfig align_config; diff --git a/src/nnet3bin/nnet3-align-compiled.cc b/src/nnet3bin/nnet3-align-compiled.cc index bab5d16f370..84a5f38b4ee 100644 --- a/src/nnet3bin/nnet3-align-compiled.cc +++ b/src/nnet3bin/nnet3-align-compiled.cc @@ -47,8 +47,8 @@ int main(int argc, char *argv[]) { "e.g.: \n" " nnet3-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n" "or:\n" - " compile-train-graphs tree 1.mdl lex.fst ark:train.tra b, ark:- | \\\n" - " nnet3-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n"; + " compile-train-graphs tree 1.mdl lex.fst 'ark:sym2int.pl -f 2- words.txt text|' \\\n" + " ark:- | nnet3-align-compiled 1.mdl ark:- scp:train.scp t, ark:1.ali\n"; ParseOptions po(usage); AlignConfig align_config; diff --git a/src/onlinebin/online-wav-gmm-decode-faster.cc b/src/onlinebin/online-wav-gmm-decode-faster.cc index e5d54b80db5..fe7c6d6b974 100644 --- a/src/onlinebin/online-wav-gmm-decode-faster.cc +++ b/src/onlinebin/online-wav-gmm-decode-faster.cc @@ -41,7 +41,7 @@ int main(int argc, char *argv[]) { const char *usage = "Reads in wav file(s) and simulates online decoding.\n" - "Writes .tra and .ali files for WER computation. Utterance " + "Writes integerized-text and .ali files for WER computation. Utterance " "segmentation is done on-the-fly.\n" "Feature splicing/LDA transform is used, if the optional(last) argument " "is given.\n" From 57658190a1c9603559b4f241b9524c1ccc808575 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Wed, 5 Apr 2017 19:26:41 +0200 Subject: [PATCH 210/213] [src] nnet1: improving the GPU diagnostics, (#1532) - we auto-detect the 'compute capability' problems (these appear as the 'invalid device function'), - we also provide guidelines what to try before posting to forum, and which info to send to us, --- src/nnetbin/cuda-gpu-available.cc | 74 ++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/src/nnetbin/cuda-gpu-available.cc b/src/nnetbin/cuda-gpu-available.cc index 897f01a8241..89fd26be86f 100644 --- a/src/nnetbin/cuda-gpu-available.cc +++ b/src/nnetbin/cuda-gpu-available.cc @@ -24,9 +24,21 @@ #include "base/kaldi-common.h" #include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-matrix.h" using namespace kaldi; +#if HAVE_CUDA == 1 +/** + * With incorrect CUDA setup, this will trigger "invalid device function" error. + */ +void TestGpuComputation() { + CuMatrix m(100,100); + m.SetRandn(); + m.ApplySoftMaxPerRow(m); +} +#endif + int main(int argc, char *argv[]) try { char hostname[100] = "UNKNOWN-HOSTNAME"; #ifndef _MSC_VER @@ -34,14 +46,33 @@ int main(int argc, char *argv[]) try { KALDI_WARN << "Cannot get hostname, " << strerror(errno); } #endif - std::cerr - << "### IS CUDA GPU AVAILABLE? '" - << hostname << "' ###" << std::endl; + KALDI_LOG << std::endl << std::endl + << "### IS CUDA GPU AVAILABLE? 
'" << hostname << "' ###"; #if HAVE_CUDA == 1 CuDevice::Instantiate().SelectGpuId("yes"); - std::cerr - << "### HURRAY, WE GOT A CUDA GPU FOR COMPUTATION!!! ###" - << std::endl; + fprintf(stderr, "### HURRAY, WE GOT A CUDA GPU FOR COMPUTATION!!! ##\n\n"); + fprintf(stderr, "### Testing CUDA setup with a small computation " + "(setup = cuda-toolkit + gpu-driver + kaldi):\n"); + // the test of setup by computation, + try { + TestGpuComputation(); + } catch (const std::exception &e) { + fprintf(stderr, "%s\n", e.what()); + KALDI_LOG << "...\n" + << "### The CUDA setup is wrong! " + << "(\"invalid device function\" == problem with 'compute capability' " + << "in compiled kaldi)\n" + << "### Before posting the error to forum, please try following:\n" + << "### 1) update kaldi & cuda-toolkit (& GPU driver),\n" + << "### 2) re-run 'src/configure',\n" + << "### 3) re-compile kaldi by 'make clean; make -j depend; make -j'\n" + << "###\n" + << "### If the problem persists, please send us your:\n" + << "### - GPU model name, cuda-toolkit version, driver version " + << "(run nvidia-smi), variable $(CUDA_ARCH) from src/kaldi.mk"; + return -1; + } + fprintf(stderr, "### Test OK!\n"); return 0; #else std::cerr @@ -51,26 +82,17 @@ int main(int argc, char *argv[]) try { return 1; #endif } catch (const std::exception &e) { - std::cerr << e.what(); - std::cerr - << "### WE DID NOT GET A CUDA GPU!!! ###" << std::endl - << "### If it's your 1st experiment with CUDA, try reinstalling " - << "'CUDA toolkit' from NVidia web (it contains the drivers)." - << std::endl - << "### In other cases run 'nvidia-smi' in terminal " - << "(gets installed with display drivers) :" - << std::endl - << "### - Check that you see your GPU." - << std::endl - << "### - Bad GPUs are reporting error or disappear from the list " - << "until reboot." - << std::endl - << "### - Check 'Memory-Usage' and 'GPU fan', " - << "which will tell you if the GPU was taken by other process." - << std::endl - << "### - Check there is same version of 'NVIDIA-SMI' and " - << "'Driver', and that it is not too old for your GPU." - << std::endl; + fprintf(stderr, "%s\n", e.what()); + KALDI_LOG << "...\n" + << "### WE DID NOT GET A CUDA GPU!!! 
###\n" + << "### If your system has a 'free' CUDA GPU, try re-installing " + << "latest 'CUDA toolkit' from NVidia (this updates GPU drivers too).\n" + << "### Otherwise 'nvidia-smi' shows the status of GPUs:\n" + << "### - The versions should match ('NVIDIA-SMI' and 'Driver Version'), " + << "otherwise reboot or reload kernel module,\n" + << "### - The GPU should be unused " + << "(no 'process' in list, low 'memory-usage' (<100MB), low 'gpu-fan' (<30%)),\n" + << "### - You should see your GPU (burnt GPUs may disappear from the list until reboot),"; return -1; } From 19df56a2295fb1d7ff531afc8069566fbce3d8fa Mon Sep 17 00:00:00 2001 From: Peter Smit Date: Tue, 11 Apr 2017 20:40:51 +0200 Subject: [PATCH 211/213] [src] Fix copy-feats for using the --write-num-frames and --compress true flags at the same time (#1541) --- src/featbin/copy-feats.cc | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/featbin/copy-feats.cc b/src/featbin/copy-feats.cc index 0fbcca6399a..f1f58653f2f 100644 --- a/src/featbin/copy-feats.cc +++ b/src/featbin/copy-feats.cc @@ -102,19 +102,31 @@ int main(int argc, char *argv[]) { CompressedMatrixWriter kaldi_writer(wspecifier); if (htk_in) { SequentialTableReader htk_reader(rspecifier); - for (; !htk_reader.Done(); htk_reader.Next(), num_done++) + for (; !htk_reader.Done(); htk_reader.Next(), num_done++) { kaldi_writer.Write(htk_reader.Key(), CompressedMatrix(htk_reader.Value().first)); + if (!num_frames_wspecifier.empty()) + num_frames_writer.Write(htk_reader.Key(), + htk_reader.Value().first.NumRows()); + } } else if (sphinx_in) { SequentialTableReader > sphinx_reader(rspecifier); - for (; !sphinx_reader.Done(); sphinx_reader.Next(), num_done++) + for (; !sphinx_reader.Done(); sphinx_reader.Next(), num_done++) { kaldi_writer.Write(sphinx_reader.Key(), CompressedMatrix(sphinx_reader.Value())); + if (!num_frames_wspecifier.empty()) + num_frames_writer.Write(sphinx_reader.Key(), + sphinx_reader.Value().NumRows()); + } } else { SequentialBaseFloatMatrixReader kaldi_reader(rspecifier); - for (; !kaldi_reader.Done(); kaldi_reader.Next(), num_done++) + for (; !kaldi_reader.Done(); kaldi_reader.Next(), num_done++) { kaldi_writer.Write(kaldi_reader.Key(), CompressedMatrix(kaldi_reader.Value())); + if (!num_frames_wspecifier.empty()) + num_frames_writer.Write(kaldi_reader.Key(), + kaldi_reader.Value().NumRows()); + } } } KALDI_LOG << "Copied " << num_done << " feature matrices."; From b1e6ec819a7c9beba256fa6934b52d6ae1e3a0be Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 12 Apr 2017 14:06:19 -0400 Subject: [PATCH 212/213] [scripts] fix to get_egs_targets.sh (thanks: David Pye) --- egs/wsj/s5/steps/nnet3/get_egs_targets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index a2749b48fac..b8fcbfd51fa 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -217,7 +217,7 @@ if [ $num_archives -eq 1 ]; then echo "*** $0: warning: the --frames-per-eg is too large to generate one archive with" echo "*** as many as --samples-per-iter egs in it. Consider reducing --frames-per-eg." 
sleep 4 -done +fi # We may have to first create a smaller number of larger archives, with number # $num_archives_intermediate, if $num_archives is more than the maximum number From 6599c9b945b5af9cc8e4817edde7b017bcccb8af Mon Sep 17 00:00:00 2001 From: Vijayaditya Peddinti Date: Wed, 12 Apr 2017 11:31:10 -0700 Subject: [PATCH 213/213] [scripts] nnet3 : fix issue where LDA estimation failed for LSTMs with label delay (#1540) xconfig : Added delay option for FixedAffineLayer. This will be used for ensuring the model specified in ref.config has at least the context required by the model specified in init.config --- .../s5c/local/chain/tuning/run_lstm_6k.sh | 304 ++++++++++++++++++ .../steps/libs/nnet3/xconfig/basic_layers.py | 22 +- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 41 +++ 3 files changed, 361 insertions(+), 6 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh new file mode 100755 index 00000000000..b9b7152dcbe --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh @@ -0,0 +1,304 @@ +#!/bin/bash + +# Copyright 2015 Johns Hopkins University (Author: Daniel Povey). +# 2015 Vijayaditya Peddinti +# 2015 Xingyu Na +# 2015 Pegah Ghahrmani +# 2017 Google Inc. (vpeddinti@google.com) +# Apache 2.0. + + + +# run_lstm_6k.sh is like run_lstm_6j.sh but making +# various kaldi-5.1-related upgrades to the script. +# For the list of changes compare tuning/run_tdnn_lstm_1{c,d}.sh + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6k # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_nj=50 + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+  [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; }
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+
+  lstm_opts="decay-time=20"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  # Note : The delay variable will be used just in the init.config.
+  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat delay=$label_delay
+
+  # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults
+  fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+  fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+  fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+
+  ## adding the layers for chain branch
+  output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model. we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 13 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in looped decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index de4c4af9df8..59b6006accb 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -1,5 +1,6 @@ # Copyright 2016 Johns Hopkins University (Dan Povey) # 2016 Vijayaditya Peddinti +# 2017 Google Inc. (vpeddinti@google.com) # Apache 2.0. """ This module contains the parent class from which all layers are inherited @@ -775,7 +776,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): # input='[-1]' [Descriptor giving the input of the layer.] # dim=None [Output dimension of layer; defaults to the same as the input dim.] # affine-transform-file='' [Must be specified.] -# +# delay=0 [Optional delay for the output-node in init.config] class XconfigFixedAffineLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == 'fixed-affine-layer' @@ -787,6 +788,7 @@ def set_default_configs(self): self.config = { 'input':'[-1]', 'dim':-1, 'affine-transform-file':'', + 'delay':0, 'write-init-config':True} def check_configs(self): @@ -819,11 +821,19 @@ def get_full_config(self): transform_file = self.config['affine-transform-file'] if self.config['write-init-config']: - # to init.config we write an output-node with the name 'output' and - # with a Descriptor equal to the descriptor that's the input to this - # layer. This will be used to accumulate stats to learn the LDA transform. 
- line = 'output-node name=output input={0}'.format(descriptor_final_string) - ans.append(('init', line)) + if self.config['delay'] != 0: + line = 'component name={0}.delayed type=NoOpComponent dim={1}'.format(self.name, input_dim) + ans.append(('init', line)) + line = 'component-node name={0}.delayed component={0}.delayed input={1}'.format(self.name, descriptor_final_string) + ans.append(('init', line)) + line = 'output-node name=output input=Offset({0}.delayed, {1})'.format(self.name, self.config['delay']) + ans.append(('init', line)) + else: + # to init.config we write an output-node with the name 'output' and + # with a Descriptor equal to the descriptor that's the input to this + # layer. This will be used to accumulate stats to learn the LDA transform. + line = 'output-node name=output input={0}'.format(descriptor_final_string) + ans.append(('init', line)) # write the 'real' component to final.config line = 'component name={0} type=FixedAffineComponent matrix={1}'.format( diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index 5184b6eed41..7e876bda1ed 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -1,5 +1,10 @@ #!/usr/bin/env python +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2017 Google Inc. (vpeddinti@google.com) +# Apache 2.0. + # we're using python 3.x style print but want it to work in python 2.x, from __future__ import print_function import argparse @@ -236,6 +241,41 @@ def add_back_compatibility_info(config_dir): common_lib.force_symlink("final.config".format(config_dir), "{0}/layer1.config".format(config_dir)) +def check_model_contexts(config_dir): + contexts = {} + for file_name in ['init', 'ref']: + if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)): + contexts[file_name] = {} + common_lib.run_kaldi_command("nnet3-init {0}/{1}.config " + "{0}/{1}.raw".format(config_dir, file_name)) + out, err = common_lib.run_kaldi_command("nnet3-info {0}/{1}.raw | " + "head -4".format(config_dir, file_name)) + # out looks like this + # left-context: 7 + # right-context: 0 + # num-parameters: 90543902 + # modulus: 1 + for line in out.split("\n"): + parts = line.split(":") + if len(parts) != 2: + continue + key = parts[0].strip() + value = int(parts[1].strip()) + if key in ['left-context', 'right-context']: + contexts[file_name][key] = value + + if contexts.has_key('init'): + assert(contexts.has_key('ref')) + if ((contexts['init']['left-context'] > contexts['ref']['left-context']) + or (contexts['init']['right-context'] > contexts['ref']['right-context'])): + raise Exception("Model specified in {0}/init.config requires greater" + " context than the model specified in {0}/ref.config." + " This might be due to use of label-delay at the output" + " in ref.config. Please use delay=$label_delay in the" + " initial fixed-affine-layer of the network, to avoid" + " this issue.") + + def main(): args = get_args() @@ -243,6 +283,7 @@ def main(): all_layers = xparser.read_xconfig_file(args.xconfig_file) write_expanded_xconfig_files(args.config_dir, all_layers) write_config_files(args.config_dir, all_layers) + check_model_contexts(args.config_dir) add_back_compatibility_info(args.config_dir)
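
For concreteness, with the xconfig line used in run_lstm_6k.sh above,

  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat delay=$label_delay

and label_delay=5, the new code path in basic_layers.py writes roughly the
following into init.config in place of the old plain output-node; the exact
descriptor string and the dim value (here 5*40 + 100 = 300 for the input and
ivector dims declared in that script) depend on the preceding layers and are
shown only as an illustration:

  component name=lda.delayed type=NoOpComponent dim=300
  component-node name=lda.delayed component=lda.delayed input=Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2), ReplaceIndex(ivector, t, 0))
  output-node name=output input=Offset(lda.delayed, 5)

The extra Offset on the output-node shifts the context required by the
init.config model (used only to accumulate LDA statistics) to match the
label-delayed output in ref.config, so that ref.config keeps at least the
context required by init.config, which is the condition the new
check_model_contexts() function verifies.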