vimalmanohar · vimalmanohar · Nov 23, 2016 · Nov 23, 2016 · Nov 24, 2016 · Nov 24, 2016
diff --git a/egs/aspire/s5/conf/mfcc_hires_bp.conf b/egs/aspire/s5/conf/mfcc_hires_bp.conf
@@ -0,0 +1,15 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+# This config is defined only on the frequencies from 330 Hz to
+# 3000 Hz, corresponding to the telephone bandwidth.
+--use-energy=false   # use average of log energy, not energy.
+--sample-frequency=8000 #  Switchboard is sampled at 8kHz
+--num-mel-bins=28
+--num-ceps=28
+--cepstral-lifter=0
+--low-freq=330    # low cutoff frequency for mel bins
+--high-freq=-1000 # high cutoff frequency, relative to Nyquist of 4000 (=3000)
+
+
diff --git a/egs/aspire/s5/conf/segmentation_music.conf b/egs/aspire/s5/conf/segmentation_music.conf
@@ -0,0 +1,14 @@
+# General segmentation options for segmentation on music / non-music
+pad_length=-1          # Pad speech segments by this many frames on either side
+max_blend_length=-1  # Maximum duration of speech that will be removed as part
+                       # of smoothing process. This is only if there are no other
+                       # speech segments nearby.
+max_intersegment_length=0  # Merge nearby speech segments if the silence
+                            # between them is less than this many frames.
+post_pad_length=-1         # Pad speech segments by this many frames on either side
+                          # after the merging process using max_intersegment_length
+max_segment_length=1000   # Segments that are longer than this are split into
+                          # overlapping frames.
+overlap_length=250        # Overlapping frames when segments are split.
+                          # See the above option.
+min_silence_length=100000     # Min silence length at which to split very long segments
diff --git a/egs/aspire/s5/conf/segmentation_speech.conf b/egs/aspire/s5/conf/segmentation_speech.conf
@@ -0,0 +1,14 @@
+# General segmentation options for SAD
+pad_length=20          # Pad speech segments by this many frames on either side
+max_relabel_length=10  # Maximum duration of speech that will be removed as part
+                       # of smoothing process. This is only if there are no other
+                       # speech segments nearby.
+max_intersegment_length=30  # Merge nearby speech segments if the silence
+                            # between them is less than this many frames.
+post_pad_length=10        # Pad speech segments by this many frames on either side
+                          # after the merging process using max_intersegment_length
+max_segment_length=1000   # Segments that are longer than this are split into
+                          # overlapping frames.
+overlap_length=250        # Overlapping frames when segments are split.
+                          # See the above option.
+min_silence_length=20     # Min silence length at which to split very long segments
diff --git a/egs/aspire/s5/conf/segmentation_speech_simple.conf b/egs/aspire/s5/conf/segmentation_speech_simple.conf
@@ -0,0 +1,15 @@
+# Simple segmentation post-processing options for SAD
+pad_length=20          # Pad speech segments by this many frames on either side
+max_relabel_length=-1  # Maximum duration of speech that will be removed as part
+                       # of smoothing process. This is only if there are no other
+                       # speech segments nearby. -1 is to disable this step.
+max_intersegment_length=30  # Merge nearby speech segments if the silence
+                            # between them is less than this many frames.
+post_pad_length=-1        # Pad speech segments by this many frames on either side
+                          # after the merging process using max_intersegment_length
+                          # -1 is to disable this step.
+max_segment_length=1000   # Segments that are longer than this are split into
+                          # overlapping frames.
+overlap_length=250        # Overlapping frames when segments are split.
+                          # See the above option.
+min_silence_length=20     # Min silence length at which to split very long segments
diff --git a/egs/aspire/s5/local/multi_condition/get_ctm.sh b/egs/aspire/s5/local/multi_condition/get_ctm.sh
@@ -7,8 +7,7 @@ decode_mbr=true
 filter_ctm_command=cp
 glm=
 stm=
-window=10
-overlap=5
+resolve_overlaps=true
 [ -f ./path.sh ] && . ./path.sh
 . parse_options.sh || exit 1;
 
@@ -62,7 +61,13 @@ lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true --ma
 lattice-to-ctm-conf $frame_shift_opt --decode-mbr=$decode_mbr ark:- $decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping || exit 1;
 
 # combine the segment-wise ctm files, while resolving overlaps
-python local/multi_condition/resolve_ctm_overlaps.py --overlap $overlap --window-length $window $data_dir/utt2spk $decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping $decode_dir/score_$LMWT/penalty_$wip/ctm.merged || exit 1;
+if $resolve_overlaps; then
+  steps/resolve_ctm_overlaps.py $data_dir/segments \
+    $decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping \
+    $decode_dir/score_$LMWT/penalty_$wip/ctm.merged || exit 1;
+else
+  cp $decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping $decode_dir/score_$LMWT/penalty_$wip/ctm.merged || exit 1;
+fi
 merged_ctm=$decode_dir/score_$LMWT/penalty_$wip/ctm.merged
 
 cat $merged_ctm | utils/int2sym.pl -f 5 $lang/words.txt | \

diff --git a/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh b/egs/aspire/s5/local/nnet3/prep_test_aspire_segmentation.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+
+# Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2016.  Apache 2.0.
+# This script generates the ctm files for dev_aspire, test_aspire and eval_aspire
+# for scoring with ASpIRE scoring server.
+# It also provides the WER for dev_aspire data.
+
+# Strict mode: abort on command failure, on failure inside a pipeline, and on
+# use of an unset variable.
+set -e
+set -o pipefail 
+set -u
+
+# Default values for the command-line options. They are defined before
+# utils/parse_options.sh is sourced below.
+# NOTE(review): num_jobs, ivector_scale, filter_ctm, weights_file and
+# silence_weight are not referenced anywhere else in this script.
+
+# general opts
+iter=final           # passed as --iter to decoding and scoring; also part of the directory affixes
+stage=0              # steps guarded by "[ $stage -le N ]" are skipped when stage > N
+decode_num_jobs=30   # passed as --nj to steps/nnet3/decode.sh
+num_jobs=30
+affix=
+
+# ivector opts
+max_count=75 # parameter for extract_ivectors.sh
+sub_speaker_frames=6000   # passed as --sub-speaker-frames to extract_ivectors.sh
+ivector_scale=0.75
+filter_ctm=true
+weights_file=
+silence_weight=0.00001
+
+# decode opts
+pass2_decode_opts="--min-active 1000"   # expanded unquoted into the decode.sh command line
+lattice_beam=8
+extra_left_context=0 # change for (B)LSTM
+extra_right_context=0 # change for BLSTM
+frames_per_chunk=50 # change for (B)LSTM
+acwt=0.1 # important to change this when using chain models
+post_decode_acwt=1.0 # important to change this when using chain models
+
+# cmd.sh presumably defines $train_cmd and $decode_cmd, which are used below
+# but not set in this file; path.sh is optional.
+. ./cmd.sh
+[ -f ./path.sh ] && . ./path.sh
+. utils/parse_options.sh || exit 1;
+
+# Exactly five positional arguments are required; otherwise print the usage
+# message and exit with status 1. The example below lists all five arguments
+# in the same order as the usage line.
+if [ $# -ne 5 ]; then
+  echo "Usage: $0 [options] <data-set> <seg-data-dir> <lang-dir> <graph-dir> <model-dir>"
+  echo " Options:"
+  echo "    --stage <n>       # start script from part-way through (stage gates: 1,2,5,6,7,8)."
+  echo "e.g.:"
+  echo "$0 dev_aspire data/dev_aspire_seg data/lang exp/tri5a/graph_pp exp/nnet3/tdnn"
+  exit 1;
+fi
+
+# Positional arguments, in the order given in the usage message.
+data_set=$1 
+seg_data_dir=$2   # existing data directory; copied to data/<segmented_data_set> at stage 1
+lang=$3 # data/lang
+graph=$4 #exp/tri5a/graph_pp
+dir=$5 # exp/nnet3/tdnn
+
+model_affix=`basename $dir`
+ivector_dir=exp/nnet3
+# ivector_affix must be computed before affix is overwritten on the next line.
+# ${affix:+_$affix} adds "_<affix>" only when affix is non-empty.
+ivector_affix=${affix:+_$affix}_chain_${model_affix}_iter$iter
+# Note: with an empty affix this yields "__iter<iter>" (double underscore).
+affix=_${affix}_iter${iter}
+act_data_set=${data_set} # we will modify the data dir, when segmenting it
+                         # so we will keep track of original data dir for the glm and stm files
+
+# Choose the name of the output ctm file from the data set name. The match is
+# a substring match, so any name containing test_aspire, eval_aspire or
+# dev_aspire is accepted; anything else is rejected with an error message.
+if [[ "$data_set" =~ "test_aspire" ]]; then
+  out_file=single_dev_test${affix}_$model_affix.ctm
+elif [[ "$data_set" =~ "eval_aspire" ]]; then
+  out_file=single_eval${affix}_$model_affix.ctm
+elif [[ "$data_set" =~  "dev_aspire" ]]; then
+  # we will just decode the directory without oracle segments file
+  # as we would like to operate in the actual evaluation condition
+  out_file=single_dev${affix}_${model_affix}.ctm
+else
+  echo "$0: unknown data set '$data_set'; expected a name containing test_aspire, eval_aspire or dev_aspire" >&2
+  exit 1
+fi
+
+# Name of the working copy of the segmented data directory under data/.
+# The later stages derive the hires data dir, the i-vector dir and the decode
+# dir names from this variable, so change it with care.
+segmented_data_set=${data_set}${affix}_seg
+
+# Stage 1: copy the caller-supplied segmented data dir to data/<segmented_data_set>.
+if [ $stage -le 1 ]; then
+  utils/copy_data_dir.sh $seg_data_dir data/${segmented_data_set}
+fi
+
+# Stage 2: extract high-resolution MFCC features and CMVN stats for the
+# segmented data into data/<segmented_data_set>_hires.
+if [ $stage -le 2 ]; then
+  mfccdir=mfcc_reverb
+  # On hosts whose FQDN ends in .clsp.jhu.edu, set up $mfccdir/storage via
+  # create_split_dir.pl across several /export/b0N disks, unless it already exists.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+    date=$(date +'%m_%d_%H_%M')
+    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/aspire-$date/s5/$mfccdir/storage $mfccdir/storage
+  fi
+
+  utils/copy_data_dir.sh data/${segmented_data_set} data/${segmented_data_set}_hires
+  # NOTE(review): the job count is hard-coded to 30 here; the num_jobs option
+  # declared at the top of the script is not used.
+  steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" \
+    --mfcc-config conf/mfcc_hires.conf data/${segmented_data_set}_hires \
+    exp/make_reverb_hires/${segmented_data_set} $mfccdir
+  steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires \
+    exp/make_reverb_hires/${segmented_data_set} $mfccdir
+  utils/fix_data_dir.sh data/${segmented_data_set}_hires
+  # --no-text: the validation is run without requiring a text file.
+  utils/validate_data_dir.sh --no-text data/${segmented_data_set}_hires
+fi
+
+# Prefix of the decode directories; "_tg" and "_fg" suffixes are appended below.
+decode_dir=$dir/decode_${segmented_data_set}_pp
+# Stage 5: extract i-vectors for the hires data with the extractor in
+# $ivector_dir/extractor. (The echo message says "stage 2" although this
+# block is gated on stage 5.)
+if [ $stage -le 5 ]; then
+  echo "Extracting i-vectors, stage 2"
+  # this does offline decoding, except we estimate the iVectors per
+  # speaker, excluding silence (based on alignments from a DNN decoding), with a
+  # different script.  This is just to demonstrate that script.
+  # the --sub-speaker-frames is optional; if provided, it will divide each speaker
+  # up into "sub-speakers" of at least that many frames... can be useful if
+  # acoustic conditions drift over time within the speaker's data.
+  # NOTE(review): no weights or alignments are passed to the command below, so
+  # the "excluding silence" remark above may not apply here; confirm.
+  steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj 20 \
+    --sub-speaker-frames $sub_speaker_frames --max-count $max_count \
+    data/${segmented_data_set}_hires $lang $ivector_dir/extractor \
+    $ivector_dir/ivectors_${segmented_data_set}${ivector_affix};
+fi
+
+# Stage 6: generate lattices in ${decode_dir}_tg with steps/nnet3/decode.sh,
+# using the i-vectors extracted at stage 5. Scoring is skipped here and done
+# at stage 8.
+if [ $stage -le 6 ]; then
+  echo "Generating lattices, stage 2 with --acwt $acwt"
+  # Remove any stale error marker, then create one if decoding fails so that
+  # the check after the command can report the failure and exit.
+  rm -f ${decode_dir}_tg/.error
+  steps/nnet3/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config $pass2_decode_opts \
+      --acwt $acwt --post-decode-acwt $post_decode_acwt \
+      --extra-left-context $extra_left_context  \
+      --extra-right-context $extra_right_context  \
+      --frames-per-chunk "$frames_per_chunk" \
+      --skip-scoring true --iter $iter --lattice-beam $lattice_beam \
+      --online-ivector-dir $ivector_dir/ivectors_${segmented_data_set}${ivector_affix} \
+     $graph data/${segmented_data_set}_hires ${decode_dir}_tg || touch ${decode_dir}_tg/.error
+  [ -f ${decode_dir}_tg/.error ] && echo "$0: Error decoding" && exit 1;
+fi
+
+# Stage 7: rescore the lattices in ${decode_dir}_tg and write the result to
+# ${decode_dir}_fg. The brace expansions produce the pairs
+# ${lang}_pp_test / ${lang}_pp_test_fg and ${decode_dir}_tg / ${decode_dir}_fg.
+if [ $stage -le 7 ]; then
+  echo "Rescoring lattices"
+  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+    --skip-scoring true \
+    ${lang}_pp_test{,_fg} data/${segmented_data_set}_hires \
+    ${decode_dir}_{tg,fg};
+fi
+
+# From here on, decode_dir refers to the rescored (_fg) directory.
+decode_dir=${decode_dir}_fg
+
+# Stage 8: score the rescored lattices with local/score_aspire.sh.
+# --resolve-overlaps is set to false here. act_data_set is the original data
+# set name (kept for the glm and stm files); segmented_data_set is the
+# segmented copy that was decoded.
+if [ $stage -le 8 ]; then
+  local/score_aspire.sh --cmd "$decode_cmd" \
+    --min-lmwt 1 --max-lmwt 20 \
+    --word-ins-penalties "0.0,0.25,0.5,0.75,1.0" \
+    --ctm-beam 6 \
+    --iter $iter \
+    --decode-mbr true \
+    --resolve-overlaps false \
+    --tune-hyper true \
+    $lang $decode_dir $act_data_set $segmented_data_set $out_file
+fi
+
+# Two-pass decoding baseline
+# %WER 27.8 | 2120 27217 | 78.2 13.6 8.2 6.0 27.8 75.9 | -0.613 | exp/chain/tdnn_7b/decode_dev_aspire_whole_uniformsegmented_win10_over5_v6_200jobs_iterfinal_pp_fg/score_9/penalty_0.0/ctm.filt.filt.sys
+# Using automatic segmentation 
+# %WER 28.2 | 2120 27214 | 76.5 12.4 11.1 4.7 28.2 75.2 | -0.522 | exp/chain/tdnn_7b/decode_dev_aspire_seg_v7_n_stddev_iterfinal_pp_fg/score_10/penalty_0.0/ctm.filt.filt.sys
diff --git a/egs/aspire/s5/local/score_aspire.sh b/egs/aspire/s5/local/score_aspire.sh
@@ -14,10 +14,9 @@ word_ins_penalties=0.0,0.25,0.5,0.75,1.0
 default_wip=0.0
 ctm_beam=6
 decode_mbr=true
-window=30
-overlap=5
 cmd=run.pl
 stage=1
+resolve_overlaps=true
 tune_hyper=true # if true:
                 #    if the data set is "dev_aspire" we check for the
                 #       best lmwt and word_insertion_penalty,
@@ -89,7 +88,7 @@ if  $tune_hyper ; then
   # or use the default values
 
   if [ $stage -le 1 ]; then
-    if [ "$act_data_set" == "dev_aspire" ]; then
+    if [[ "$act_data_set" =~ "dev_aspire" ]]; then
       wip_string=$(echo $word_ins_penalties | sed 's/,/ /g')
       temp_wips=($wip_string)
       $cmd WIP=1:${#temp_wips[@]} $decode_dir/scoring/log/score.wip.WIP.log \
@@ -98,8 +97,8 @@ if  $tune_hyper ; then
         echo \$wip \&\& \
         $cmd LMWT=$min_lmwt:$max_lmwt $decode_dir/scoring/log/score.LMWT.\$wip.log \
           local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \
-            --window $window --overlap $overlap \
             --beam $ctm_beam --decode-mbr $decode_mbr \
+            --resolve-overlaps $resolve_overlaps \
             --glm data/${act_data_set}/glm --stm data/${act_data_set}/stm \
           LMWT \$wip $lang data/${segmented_data_set}_hires $model $decode_dir || exit 1;
 
@@ -124,7 +123,7 @@ wipfile.close()
   fi
 
 
-  if [ "$act_data_set" == "test_aspire" ] || [ "$act_data_set" == "eval_aspire" ]; then
+  if [[ "$act_data_set" =~ "test_aspire" ]] || [[ "$act_data_set" =~ "eval_aspire" ]]; then
     # check for the best values from dev_aspire decodes
     dev_decode_dir=$(echo $decode_dir|sed "s/test_aspire/dev_aspire_whole/g; s/eval_aspire/dev_aspire_whole/g")
     if [ -f $dev_decode_dir/scoring/bestLMWT ]; then