diff --git a/egs/aspire/s5/local/segmentation/do_corruption_data_dir.sh b/egs/aspire/s5/local/segmentation/do_corruption_data_dir.sh
index a01505be695..690ef82e287 100755
--- a/egs/aspire/s5/local/segmentation/do_corruption_data_dir.sh
+++ b/egs/aspire/s5/local/segmentation/do_corruption_data_dir.sh
@@ -9,31 +9,33 @@ set -o pipefail
 
 . path.sh
 
-stage=0
-corruption_stage=-10
-corrupt_only=false
-
-# Data options
+# The following are the main parameters to modify
 data_dir=data/train_si284   # Expecting whole data directory.
-speed_perturb=true
+vad_dir=   # Output of prepare_unsad_data.sh. 
+           # If provided, the speech labels and deriv weights will be 
+           # copied into the output data directory.
+
 num_data_reps=5   # Number of corrupted versions
-snrs="20:10:15:5:0:-5"
 foreground_snrs="20:10:15:5:0:-5"
 background_snrs="20:10:15:5:2:0:-2:-5"
-base_rirs=simulated
-speeds="0.9 1.0 1.1"
+
+stage=0
 
 # Parallel options
-reco_nj=40  
-cmd=queue.pl
+nj=4
+cmd=run.pl
 
 # Options for feature extraction
 mfcc_config=conf/mfcc_hires_bp.conf
 feat_suffix=hires_bp
 
-reco_vad_dir=   # Output of prepare_unsad_data.sh. 
-                # If provided, the speech labels and deriv weights will be 
-                # copied into the output data directory.
+# Data options
+corrupt_only=false
+speed_perturb=true
+speeds="0.9 1.0 1.1"
+resample_data_dir=false
+
+
 
 . utils/parse_options.sh
 
@@ -45,16 +47,21 @@ fi
 data_id=`basename ${data_dir}`
 
 rvb_opts=()
-if [ "$base_rirs" == "simulated" ]; then
-  # This is the config for the system using simulated RIRs and point-source noises
-  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
-  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
-  rvb_opts+=(--noise-set-parameters "0.1, RIRS_NOISES/pointsource_noises/background_noise_list")
-  rvb_opts+=(--noise-set-parameters "0.9, RIRS_NOISES/pointsource_noises/foreground_noise_list")
-else
-  # This is the config for the JHU ASpIRE submission system
-  rvb_opts+=(--rir-set-parameters "1.0, RIRS_NOISES/real_rirs_isotropic_noises/rir_list")
-  rvb_opts+=(--noise-set-parameters RIRS_NOISES/real_rirs_isotropic_noises/noise_list)
+# This is the config for the system using simulated RIRs and point-source noises
+rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
+rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
+rvb_opts+=(--noise-set-parameters "0.1, RIRS_NOISES/pointsource_noises/background_noise_list")
+rvb_opts+=(--noise-set-parameters "0.9, RIRS_NOISES/pointsource_noises/foreground_noise_list")
+
+if $resample_data_dir; then
+  sample_frequency=`cat $mfcc_config | perl -ne 'if (m/--sample-frequency=(\S+)/) { print $1; }'` 
+  if [ -z "$sample_frequency" ]; then
+    sample_frequency=16000
+  fi
+
+  utils/data/resample_data_dir.sh $sample_frequency ${data_dir} || exit 1
+  data_id=`basename ${data_dir}`
+  rvb_opts+=(--source-sampling-rate=$sample_frequency)
 fi
 
 corrupted_data_id=${data_id}_corrupted
@@ -119,17 +126,17 @@ else
 fi 
 
 if [ $stage -le 8 ]; then
-  if [ ! -z "$reco_vad_dir" ]; then
-    if [ ! -f $reco_vad_dir/speech_labels.scp ]; then
-      echo "$0: Could not find file $reco_vad_dir/speech_labels.scp"
+  if [ ! -z "$vad_dir" ]; then
+    if [ ! -f $vad_dir/speech_labels.scp ]; then
+      echo "$0: Could not find file $vad_dir/speech_labels.scp"
       exit 1
     fi
     
-    cat $reco_vad_dir/speech_labels.scp | \
+    cat $vad_dir/speech_labels.scp | \
       steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \
       sort -k1,1 > ${corrupted_data_dir}/speech_labels.scp
   
-    cat $reco_vad_dir/deriv_weights.scp | \
+    cat $vad_dir/deriv_weights.scp | \
       steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \
       sort -k1,1 > ${corrupted_data_dir}/deriv_weights.scp
   fi
diff --git a/egs/aspire/s5/local/segmentation/do_corruption_data_dir_music.sh b/egs/aspire/s5/local/segmentation/do_corruption_data_dir_music.sh
index 8a5a552b2ab..7e7d84c0010 100755
--- a/egs/aspire/s5/local/segmentation/do_corruption_data_dir_music.sh
+++ b/egs/aspire/s5/local/segmentation/do_corruption_data_dir_music.sh
@@ -13,21 +13,25 @@ set -o pipefail
 data_dir=data/train_si284
 vad_dir=      # Location of directory with VAD labels
 
-num_data_reps=5
+num_data_reps=5   # Number of corrupted versions
 foreground_snrs="5:2:1:0:-2:-5:-10:-20"
 background_snrs="5:2:1:0:-2:-5:-10:-20"
 
-cmd=run.pl
+stage=0
+
+# Parallel options
 nj=4
+cmd=run.pl
 
-stage=0
 
+# Options for feature extraction
 mfcc_config=conf/mfcc_hires_bp.conf
 feat_suffix=hires_bp
 
-dry_run=false   # If true, exits after preparing the corrupted wav.scp
+corrupt_only=false
 speed_perturb=true
 speeds="0.9 1.0 1.1"
+resample_data_dir=false
 
 label_dir=music_labels    # Directory to dump music labels
 
@@ -70,6 +74,17 @@ for f in RIRS_NOISES/simulated_rirs/smallroom/rir_list \
   echo "$0: Could not find $f" && exit 1
 done
 
+if $resample_data_dir; then
+  sample_frequency=`cat $mfcc_config | perl -ne 'if (m/--sample-frequency=(\S+)/) { print $1; }'` 
+  if [ -z "$sample_frequency" ]; then
+    sample_frequency=16000
+  fi
+
+  utils/data/resample_data_dir.sh $sample_frequency ${data_dir} || exit 1
+  data_id=`basename ${data_dir}`
+  rvb_opts+=(--source-sampling-rate=$sample_frequency)
+fi
+
 corrupted_data_id=${data_id}_music_corrupted
 orig_corrupted_data_id=$corrupted_data_id
 
@@ -87,10 +102,6 @@ if [ $stage -le 1 ]; then
     data/${data_id} data/${corrupted_data_id}
 fi
 
-if $dry_run; then
-  exit 0
-fi
-
 corrupted_data_dir=data/${corrupted_data_id}
 # Data dir without speed perturbation
 orig_corrupted_data_dir=$corrupted_data_dir   
@@ -111,6 +122,11 @@ if $speed_perturb; then
   fi
 fi
 
+if $corrupt_only; then
+  echo "$0: Got corrupted data directory in ${corrupted_data_dir}"
+  exit 0
+fi
+
 mfccdir=`basename $mfcc_config`
 mfccdir=${mfccdir%%.conf}
 
@@ -215,11 +231,13 @@ if [ $stage -le 7 ]; then
       ark:$music_dir/music_segmentation.JOB.ark \
       ark,scp:$label_dir/music_labels_${corrupted_data_id}.JOB.ark,$label_dir/music_labels_${corrupted_data_id}.JOB.scp
   fi
-fi
 
-for n in `seq $nj`; do
-  cat $label_dir/music_labels_${corrupted_data_id}.$n.scp
-done | utils/filter_scp.pl ${corrupted_data_dir}/utt2spk > ${corrupted_data_dir}/music_labels.scp
+  for n in `seq $nj`; do
+    cat $label_dir/music_labels_${corrupted_data_id}.$n.scp
+  done | \
+    steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps "music" | \
+    utils/filter_scp.pl ${corrupted_data_dir}/utt2spk > ${corrupted_data_dir}/music_labels.scp
+fi
 
 if [ $stage -le 8 ]; then
   utils/split_data.sh --per-utt ${corrupted_data_dir} $nj
diff --git a/egs/aspire/s5/local/segmentation/do_corruption_data_dir_snr.sh b/egs/aspire/s5/local/segmentation/do_corruption_data_dir_snr.sh
index 19b4036c9aa..d98b98bd3ac 100755
--- a/egs/aspire/s5/local/segmentation/do_corruption_data_dir_snr.sh
+++ b/egs/aspire/s5/local/segmentation/do_corruption_data_dir_snr.sh
@@ -9,32 +9,33 @@ set -o pipefail
 
 . path.sh
 
-stage=0
-corruption_stage=-10
-corrupt_only=false
-
-# Data options
+# The following are the main parameters to modify
 data_dir=data/train_si284   # Expecting whole data directory.
-speed_perturb=true
+vad_dir=   # Output of prepare_unsad_data.sh. 
+           # If provided, the speech labels and deriv weights will be 
+           # copied into the output data directory.
+
 num_data_reps=5   # Number of corrupted versions
-snrs="20:10:15:5:0:-5"
 foreground_snrs="20:10:15:5:0:-5"
 background_snrs="20:10:15:5:2:0:-2:-5"
-base_rirs=simulated
-speeds="0.9 1.0 1.1"
-resample_data_dir=false
+
+stage=0
 
 # Parallel options
-reco_nj=40  
-cmd=queue.pl
+nj=4
+cmd=run.pl
 
 # Options for feature extraction
 mfcc_config=conf/mfcc_hires_bp.conf
 feat_suffix=hires_bp
 
-reco_vad_dir=   # Output of prepare_unsad_data.sh. 
-                # If provided, the speech labels and deriv weights will be 
-                # copied into the output data directory.
+# Data options
+corrupt_only=false
+speed_perturb=true
+speeds="0.9 1.0 1.1"
+resample_data_dir=false
+
+
 
 . utils/parse_options.sh
 
@@ -45,19 +46,25 @@ fi
 
 data_id=`basename ${data_dir}`
 
-rvb_opts=()
-if [ "$base_rirs" == "simulated" ]; then
-  # This is the config for the system using simulated RIRs and point-source noises
-  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
-  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
-  rvb_opts+=(--noise-set-parameters "0.1, RIRS_NOISES/pointsource_noises/background_noise_list")
-  rvb_opts+=(--noise-set-parameters "0.9, RIRS_NOISES/pointsource_noises/foreground_noise_list")
-else
-  # This is the config for the JHU ASpIRE submission system
-  rvb_opts+=(--rir-set-parameters "1.0, RIRS_NOISES/real_rirs_isotropic_noises/rir_list")
-  rvb_opts+=(--noise-set-parameters RIRS_NOISES/real_rirs_isotropic_noises/noise_list)
+if [ ! -d RIRS_NOISES/ ]; then
+  # Download and unpack the RIRS_NOISES corpus (OpenSLR resource 28)
+  wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+  unzip rirs_noises.zip
 fi
 
+rvb_opts=()
+# This is the config for the system using simulated RIRs and point-source noises
+rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
+rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
+rvb_opts+=(--noise-set-parameters "0.1, RIRS_NOISES/pointsource_noises/background_noise_list")
+rvb_opts+=(--noise-set-parameters "0.9, RIRS_NOISES/pointsource_noises/foreground_noise_list")
+
+for f in RIRS_NOISES/simulated_rirs/smallroom/rir_list \
+    RIRS_NOISES/simulated_rirs/mediumroom/rir_list \
+    $data_dir/wav.scp; do  # inputs that must exist before corruption starts
+  [ -f "$f" ] || { echo "$0: Could not find $f"; exit 1; }
+done
+
 if $resample_data_dir; then
   sample_frequency=`cat $mfcc_config | perl -ne 'if (m/--sample-frequency=(\S+)/) { print $1; }'` 
   if [ -z "$sample_frequency" ]; then
@@ -134,7 +141,7 @@ if [ $stage -le 4 ]; then
   utils/copy_data_dir.sh $corrupted_data_dir ${corrupted_data_dir}_$feat_suffix
   corrupted_data_dir=${corrupted_data_dir}_$feat_suffix
   steps/make_mfcc.sh --mfcc-config $mfcc_config \
-    --cmd "$cmd" --nj $reco_nj \
+    --cmd "$cmd" --nj $nj --write-utt2num-frames true \
     $corrupted_data_dir exp/make_${feat_suffix}/${corrupted_data_id} $mfccdir
   steps/compute_cmvn_stats.sh --fake \
     $corrupted_data_dir exp/make_${feat_suffix}/${corrupted_data_id} $mfccdir
@@ -202,7 +209,7 @@ if [ $stage -le 7 ]; then
     --cepstral-lifter=$cepstral_lifter \
     exp/make_irm_targets/$corrupted_data_id/idct_matrix
 
-  # Get log-SNR targets 
+  # Get log-IRM targets 
   steps/segmentation/make_snr_targets.sh \
-    --nj $reco_nj --cmd "$cmd" \
+    --nj $nj --cmd "$cmd" \
     --target-type Irm --compress false \
@@ -213,21 +220,21 @@ fi
 
 
 if [ $stage -le 8 ]; then
-  if [ ! -z "$reco_vad_dir" ]; then
-    if [ ! -f $reco_vad_dir/speech_labels.scp ]; then
-      echo "$0: Could not find file $reco_vad_dir/speech_labels.scp"
+  if [ ! -z "$vad_dir" ]; then
+    if [ ! -f $vad_dir/speech_labels.scp ]; then
+      echo "$0: Could not find file $vad_dir/speech_labels.scp"
       exit 1
     fi
     
-    cat $reco_vad_dir/speech_labels.scp | \
+    cat $vad_dir/speech_labels.scp | \
       steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \
       sort -k1,1 > ${corrupted_data_dir}/speech_labels.scp
   
-    cat $reco_vad_dir/deriv_weights.scp | \
+    cat $vad_dir/deriv_weights.scp | \
       steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \
       sort -k1,1 > ${corrupted_data_dir}/deriv_weights.scp
     
-    cat $reco_vad_dir/deriv_weights_manual_seg.scp | \
+    cat $vad_dir/deriv_weights_manual_seg.scp | \
       steps/segmentation/get_reverb_scp.pl -f 1 $num_data_reps | \
       sort -k1,1 > ${corrupted_data_dir}/deriv_weights_for_irm_targets.scp
   fi
diff --git a/egs/aspire/s5/local/segmentation/prepare_unsad_data.sh b/egs/aspire/s5/local/segmentation/prepare_unsad_data.sh
index cccc7e2db84..df40337e17e 100755
--- a/egs/aspire/s5/local/segmentation/prepare_unsad_data.sh
+++ b/egs/aspire/s5/local/segmentation/prepare_unsad_data.sh
@@ -11,8 +11,8 @@ set -e
 
 stage=-2
 cmd=queue.pl
-reco_nj=40
-nj=100
+reco_nj=40      # Number of jobs to work at recording-level
+nj=100          # Number of jobs to work at utterance-level
 
 # Options to be passed to get_sad_map.py 
 map_noise_to_sil=true   # Map noise phones to silence label (0)
@@ -21,7 +21,8 @@ sad_map=    # Initial mapping from phones to speech/non-speech labels.
             # Overrides the default mapping using phones/silence.txt 
             # and phones/nonsilence.txt
 
-# Options for feature extraction
+# Options for feature extraction 
+# (These must match the features used for model_dir and sat_model_dir)
 feat_type=mfcc        # mfcc or plp
 add_pitch=false       # Add pitch features
 
@@ -117,10 +118,10 @@ function make_mfcc {
   fi
 
   if $add_pitch; then
-    steps/make_mfcc_pitch.sh --cmd "$cmd" --nj $nj \
+    steps/make_mfcc_pitch.sh --cmd "$cmd" --nj $nj --write-utt2num-frames true \
       --mfcc-config $mfcc_config --pitch-config $pitch_config $1 $2 $3 || exit 1
   else
-    steps/make_mfcc.sh --cmd "$cmd" --nj $nj \
+    steps/make_mfcc.sh --cmd "$cmd" --nj $nj --write-utt2num-frames true \
       --mfcc-config $mfcc_config $1 $2 $3 || exit 1
   fi
 
@@ -160,10 +161,10 @@ function make_plp {
   fi
   
   if $add_pitch; then
-    steps/make_plp_pitch.sh --cmd "$cmd" --nj $nj \
+    steps/make_plp_pitch.sh --cmd "$cmd" --nj $nj --write-utt2num-frames true \
       --plp-config $plp_config --pitch-config $pitch_config $1 $2 $3 || exit 1
   else
-    steps/make_plp.sh --cmd "$cmd" --nj $nj \
+    steps/make_plp.sh --cmd "$cmd" --nj $nj --write-utt2num-frames true \
       --plp-config $plp_config $1 $2 $3 || exit 1
   fi
 }
@@ -177,15 +178,16 @@ data_id=$(basename $data_dir)
 whole_data_dir=${data_dir}_whole
 whole_data_id=${data_id}_whole
 
-if [ $stage -le -2 ]; then
+if [ $stage -le -2 ]; then  # default stage is -2, so the SAD map is built by default
   steps/segmentation/get_sad_map.py \
     --init-sad-map="$sad_map" \
     --map-noise-to-sil=$map_noise_to_sil \
     --map-unk-to-speech=$map_unk_to_speech \
     $lang | utils/sym2int.pl -f 1 $lang/phones.txt > $dir/sad_map
+fi
 
+if [ $stage -le -2 ]; then  # same initial stage; not re-run when resuming later stages
   utils/data/convert_data_dir_to_whole.sh ${data_dir} ${whole_data_dir}
-  utils/data/get_utt2dur.sh ${whole_data_dir}
 fi 
 
 if $speed_perturb; then
@@ -232,11 +234,6 @@ if $speed_perturb; then
   data_id=${data_id}_sp
 fi
 
-
-###############################################################################
-# Compute length of recording
-###############################################################################
-
 if [ $stage -le 0 ]; then
   utils/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp
   cp $data_dir/tmp/feats.scp $data_dir
@@ -300,14 +297,6 @@ if [ $stage -le 4 ]; then
 fi
 
   
-#utils/split_data.sh --per-reco $data_dir $reco_nj
-#segmentation-combine-segments ark,s:$vad_dir/sad_seg.scp 
-#  "ark,s:segmentation-init-from-segments --shift-to-zero=false --frame-shift=$ali_frame_shift --frame-overlap=$ali_frame_overlap ${data}/split${reco_nj}reco/JOB/segments ark:- |" \
-#  "ark:cat ${data}/split${reco_nj}reco/JOB/segments | cut -d ' ' -f 1,2 | utils/utt2spk_to_spk2utt.pl | sort -k1,1 |" ark:- 
-
-###############################################################################
-
-
 # Create extended data directory that consists of the provided 
 # segments along with the segments outside it.
 # This is basically dividing the whole recording into pieces
@@ -320,49 +309,30 @@ fi
 
 outside_data_dir=$dir/${data_id}_outside
 if [ $stage -le 5 ]; then
-  rm -rf $outside_data_dir
-  mkdir -p $outside_data_dir/split${reco_nj}reco
+  rm -rf $outside_data_dir && mkdir -p $outside_data_dir  # cp below needs the directory to exist
 
   for f in wav.scp reco2file_and_channel stm glm; do 
     [ -f ${data_dir}/$f ] && cp ${data_dir}/$f $outside_data_dir
   done
    
-  steps/segmentation/split_data_on_reco.sh $data_dir $whole_data_dir $reco_nj
-
-  for n in `seq $reco_nj`; do 
-    dsn=$whole_data_dir/split${reco_nj}reco/$n
-    awk '{print $2}' $dsn/segments | \
-      utils/filter_scp.pl /dev/stdin $whole_data_dir/utt2num_frames > \
-      $dsn/utt2num_frames
-    mkdir -p $outside_data_dir/split${reco_nj}reco/$n
-  done
+  utils/data/get_utt2num_frames.sh $whole_data_dir
 
-  $cmd JOB=1:$reco_nj $outside_data_dir/log/get_empty_segments.JOB.log \
+  $cmd $outside_data_dir/log/get_empty_segments.log \
     segmentation-init-from-segments --frame-shift=$frame_shift \
-    --frame-overlap=$frame_overlap --shift-to-zero=false \
-    ${data_dir}/split${reco_nj}reco/JOB/segments ark:- \| \
+      --frame-overlap=$frame_overlap --shift-to-zero=false \
+      ${data_dir}/segments ark:- \| \
     segmentation-combine-segments-to-recordings ark:- \
-    "ark,t:cut -d ' ' -f 1,2 ${data_dir}/split${reco_nj}reco/JOB/segments  | utils/utt2spk_to_spk2utt.pl |" ark:- \| \
+      "ark,t:utils/data/get_reco2utt.sh ${data_dir} |" ark:- \| \
     segmentation-create-subsegments --filter-label=1 --subsegment-label=0 \
-    "ark:segmentation-init-from-lengths --label=1 ark,t:${whole_data_dir}/split${reco_nj}reco/JOB/utt2num_frames ark:- |" \
-    ark:- ark:- \| \
+      "ark:segmentation-init-from-lengths --label=1 ark,t:${whole_data_dir}/utt2num_frames ark:- |" \
+      ark:- ark:- \| \
     segmentation-post-process --remove-labels=0 --max-segment-length=1000 \
-    --post-process-label=1 --overlap-length=50 \
-    ark:- ark:- \| segmentation-to-segments --single-speaker=true \
-    --frame-shift=$frame_shift --frame-overlap=$frame_overlap \
-    ark:- ark,t:$outside_data_dir/split${reco_nj}reco/JOB/utt2spk \
-    $outside_data_dir/split${reco_nj}reco/JOB/segments || exit 1
-
-  for n in `seq $reco_nj`; do
-    cat $outside_data_dir/split${reco_nj}reco/$n/utt2spk
-  done | sort -k1,1 > $outside_data_dir/utt2spk
-  
-  for n in `seq $reco_nj`; do
-    cat $outside_data_dir/split${reco_nj}reco/$n/segments
-  done | sort -k1,1 > $outside_data_dir/segments
+      --post-process-label=1 --overlap-length=50 ark:- ark:- \| \
+    segmentation-to-segments --single-speaker=true \
+      --frame-shift=$frame_shift --frame-overlap=$frame_overlap \
+      ark:- ark,t:$outside_data_dir/utt2spk $outside_data_dir/segments
 
   utils/fix_data_dir.sh $outside_data_dir
-  
 fi
 
 
@@ -378,8 +348,6 @@ if [ $stage -le 7 ]; then
   utils/fix_data_dir.sh $outside_data_dir
   
   utils/combine_data.sh $extended_data_dir $data_dir $outside_data_dir
-
-  steps/segmentation/split_data_on_reco.sh $data_dir $extended_data_dir $reco_nj
 fi
 
 ###############################################################################
@@ -442,15 +410,29 @@ reco_vad_dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir";
 echo $reco_nj > $reco_vad_dir/num_jobs
 
 if [ $stage -le 11 ]; then
+  utils/data/get_reco2utt.sh $extended_data_dir > $reco_vad_dir/reco2utt
+  splits=
+  for n in `seq $reco_nj`; do
+    splits="$splits $reco_vad_dir/reco2utt.$n.$reco_nj"
+  done
+  utils/split_scp.pl $reco_vad_dir/reco2utt $splits
+  
+  for n in `seq $reco_nj`; do
+    utils/spk2utt_to_utt2spk.pl $reco_vad_dir/reco2utt.$n.$reco_nj > $reco_vad_dir/utt2reco.$n.$reco_nj
+  done
+
   $cmd JOB=1:$reco_nj $reco_vad_dir/log/intersect_vad.JOB.log \
-    segmentation-intersect-segments --mismatch-label=10 \
-    "scp:cat $vad_dir/sad_seg.scp $vad_dir/outside_sad_seg.scp | sort -k1,1 | utils/filter_scp.pl $extended_data_dir/split${reco_nj}reco/JOB/utt2spk |" \
-    "scp:utils/filter_scp.pl $extended_data_dir/split${reco_nj}reco/JOB/utt2spk $decode_vad_dir/sad_seg.scp |" \
-    ark:- \| segmentation-post-process --remove-labels=10 \
-    --merge-adjacent-segments --max-intersegment-length=10 ark:- ark:- \| \
-    segmentation-combine-segments ark:- "ark:segmentation-init-from-segments --shift-to-zero=false $extended_data_dir/split${reco_nj}reco/JOB/segments ark:- |" \
-    ark,t:$extended_data_dir/split${reco_nj}reco/JOB/reco2utt \
-    ark,scp:$reco_vad_dir/sad_seg.JOB.ark,$reco_vad_dir/sad_seg.JOB.scp
+    segmentation-intersect-segments --mismatch-label=1000 \
+      "scp:cat $vad_dir/sad_seg.scp $vad_dir/outside_sad_seg.scp | sort -k1,1 | utils/filter_scp.pl $reco_vad_dir/utt2reco.JOB.$reco_nj |" \
+      "scp:utils/filter_scp.pl $reco_vad_dir/utt2reco.JOB.$reco_nj $decode_vad_dir/sad_seg.scp |" \
+      ark:- \| \
+    segmentation-post-process --remove-labels=1000 \
+      --merge-adjacent-segments --max-intersegment-length=10 ark:- ark:- \| \
+    segmentation-combine-segments ark:- \
+      "ark:utils/filter_scp.pl $reco_vad_dir/utt2reco.JOB.$reco_nj $extended_data_dir/segments | segmentation-init-from-segments --shift-to-zero=false - ark:- |" \
+      ark,t:$reco_vad_dir/reco2utt.JOB.$reco_nj \
+      ark,scp:$reco_vad_dir/sad_seg.JOB.ark,$reco_vad_dir/sad_seg.JOB.scp
+
   for n in `seq $reco_nj`; do
     cat $reco_vad_dir/sad_seg.$n.scp
   done > $reco_vad_dir/sad_seg.scp
@@ -464,51 +446,58 @@ for n in `seq $reco_nj`; do
 done
 set -e
 
+# Deriv weights to train only on "good" frames, i.e. where alignment and decoding match
 if [ $stage -le 12 ]; then
   $cmd JOB=1:$reco_nj $reco_vad_dir/log/get_deriv_weights.JOB.log \
     segmentation-post-process --merge-labels=0:1:2:3 --merge-dst-label=1 \
-    scp:$reco_vad_dir/sad_seg.JOB.scp ark:- \| \
+      scp:$reco_vad_dir/sad_seg.JOB.scp ark:- \| \
     segmentation-to-ali --lengths-rspecifier=ark,t:${whole_data_dir}/utt2num_frames ark:- ark,t:- \| \
     steps/segmentation/convert_ali_to_vec.pl \| copy-vector ark,t:- \
-    ark,scp:$reco_vad_dir/deriv_weights.JOB.ark,$reco_vad_dir/deriv_weights.JOB.scp
+      ark,scp:$reco_vad_dir/deriv_weights.JOB.ark,$reco_vad_dir/deriv_weights.JOB.scp
   
   for n in `seq $reco_nj`; do
     cat $reco_vad_dir/deriv_weights.$n.scp
   done > $reco_vad_dir/deriv_weights.scp
 fi
 
+# Deriv weights to train only on silence frames
 if [ $stage -le 13 ]; then
   $cmd JOB=1:$reco_nj $reco_vad_dir/log/get_deriv_weights_for_uncorrupted.JOB.log \
-    segmentation-post-process --remove-labels=1:2:3 scp:$reco_vad_dir/sad_seg.JOB.scp \
-    ark:- \| segmentation-post-process --merge-labels=0 --merge-dst-label=1 ark:- ark:- \| \
+    segmentation-post-process --remove-labels=1:2:3 scp:$reco_vad_dir/sad_seg.JOB.scp ark:- \| \
+    segmentation-post-process --merge-labels=0 --merge-dst-label=1 ark:- ark:- \| \
     segmentation-to-ali --lengths-rspecifier=ark,t:${whole_data_dir}/utt2num_frames ark:- ark,t:- \| \
     steps/segmentation/convert_ali_to_vec.pl \| copy-vector ark,t:- \
-    ark,scp:$reco_vad_dir/deriv_weights_for_uncorrupted.JOB.ark,$reco_vad_dir/deriv_weights_for_uncorrupted.JOB.scp
+      ark,scp:$reco_vad_dir/deriv_weights_for_uncorrupted.JOB.ark,$reco_vad_dir/deriv_weights_for_uncorrupted.JOB.scp
+
   for n in `seq $reco_nj`; do
     cat $reco_vad_dir/deriv_weights_for_uncorrupted.$n.scp
   done > $reco_vad_dir/deriv_weights_for_uncorrupted.scp
 fi
 
+# Get per-frame SAD labels at recording-level
 if [ $stage -le 14 ]; then
   $cmd JOB=1:$reco_nj $reco_vad_dir/log/get_speech_labels.JOB.log \
     segmentation-copy --keep-label=1 scp:$reco_vad_dir/sad_seg.JOB.scp ark:- \| \
     segmentation-to-ali --lengths-rspecifier=ark,t:${whole_data_dir}/utt2num_frames \
-    ark:- ark,scp:$reco_vad_dir/speech_labels.JOB.ark,$reco_vad_dir/speech_labels.JOB.scp
+      ark:- ark,scp:$reco_vad_dir/speech_labels.JOB.ark,$reco_vad_dir/speech_labels.JOB.scp
+
   for n in `seq $reco_nj`; do
     cat $reco_vad_dir/speech_labels.$n.scp
   done > $reco_vad_dir/speech_labels.scp
 fi
 
+# Deriv weights to train only on manual segments
 if [ $stage -le 15 ]; then
   $cmd JOB=1:$reco_nj $reco_vad_dir/log/convert_manual_segments_to_deriv_weights.JOB.log \
-    segmentation-init-from-segments --shift-to-zero=false \
-    $data_dir/split${reco_nj}reco/JOB/segments ark:- \| \
+    segmentation-init-from-segments --shift-to-zero=false --frame-shift=$frame_shift --frame-overlap=$frame_overlap \
+      "utils/filter_scp.pl $reco_vad_dir/utt2reco.JOB.$reco_nj $data_dir/segments |" ark:- \| \
     segmentation-combine-segments-to-recordings ark:- \
-    ark:$data_dir/split${reco_nj}reco/JOB/reco2utt ark:- \| \
+      "ark,t:utils/data/get_reco2utt.sh $data_dir | utils/filter_scp.pl $reco_vad_dir/reco2utt.JOB.$reco_nj |" \
+      ark:- \| \
     segmentation-to-ali --lengths-rspecifier=ark,t:${whole_data_dir}/utt2num_frames \
-    ark:- ark,t:- \| \
+      ark:- ark,t:- \| \
     steps/segmentation/convert_ali_to_vec.pl \| copy-vector ark,t:- \
-    ark,scp:$reco_vad_dir/deriv_weights_manual_seg.JOB.ark,$reco_vad_dir/deriv_weights_manual_seg.JOB.scp
+      ark,scp:$reco_vad_dir/deriv_weights_manual_seg.JOB.ark,$reco_vad_dir/deriv_weights_manual_seg.JOB.scp
 
   for n in `seq $reco_nj`; do
     cat $reco_vad_dir/deriv_weights_manual_seg.$n.scp
diff --git a/egs/wsj/s5/steps/segmentation/split_data_on_reco.sh b/egs/wsj/s5/steps/segmentation/split_data_on_reco.sh
deleted file mode 100755
index b85c4493cdf..00000000000
--- a/egs/wsj/s5/steps/segmentation/split_data_on_reco.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#! /bin/bash
-
-# Copyright 2016  Vimal Manohar
-# Apache 2.0
-
-set -e 
-
-if [ $# -ne 3 ]; then
-  echo "This script splits <data-dir> on recording-level in the same "
-  echo "way as <ref-data-dir> is split."
-  echo "Usage: split_data_on_reco.sh <ref-data-dir> <data-dir> <nj>"
-  exit 1
-fi
-
-ref_data=$1
-data=$2
-nj=$3
-
-utils/data/get_reco2utt.sh $ref_data
-utils/data/get_reco2utt.sh $data
-
-utils/split_data.sh --per-reco $ref_data $nj
- 
-for n in `seq $nj`; do 
-  srn=$ref_data/split${nj}reco/$n
-  dsn=$data/split${nj}reco/$n
-  
-  mkdir -p $dsn
-
-  utils/data/get_reco2utt.sh $srn
-  utils/filter_scp.pl $srn/reco2utt $data/reco2utt > $dsn/reco2utt
-  utils/spk2utt_to_utt2spk.pl $dsn/reco2utt > $dsn/utt2reco 
-  utils/subset_data_dir.sh --utt-list $dsn/utt2reco $data $dsn
-done
diff --git a/egs/wsj/s5/utils/split_data.sh b/egs/wsj/s5/utils/split_data.sh
index 94ba4f555ce..bc5894e7551 100755
--- a/egs/wsj/s5/utils/split_data.sh
+++ b/egs/wsj/s5/utils/split_data.sh
@@ -16,28 +16,20 @@
 # limitations under the License.
 
 split_per_spk=true
-split_per_reco=false
 if [ "$1" == "--per-utt" ]; then
   split_per_spk=false
   shift
-elif [ "$1" == "--per-reco" ]; then
-  split_per_spk=false
-  split_per_reco=true
-  shift
 fi
 
 if [ $# != 2 ]; then
-  echo "Usage: $0 [--per-utt|--per-reco] <data-dir> <num-to-split>"
+  echo "Usage: $0 [--per-utt] <data-dir> <num-to-split>"
   echo "E.g.: $0 data/train 50"
   echo "It creates its output in e.g. data/train/split50/{1,2,3,...50}, or if the "
   echo "--per-utt option was given, in e.g. data/train/split50utt/{1,2,3,...50}."
-  echo "If the --per-reco option was given, in e.g. data/train/split50reco/{1,2,3,...50}."
   echo ""
   echo "This script will not split the data-dir if it detects that the output is newer than the input."
   echo "By default it splits per speaker (so each speaker is in only one split dir),"
   echo "but with the --per-utt option it will ignore the speaker information while splitting."
-  echo "But if --per-reco option is given, it splits per recording "
-  echo "(so each recording is in only one split dir)"
   exit 1
 fi
 
@@ -75,14 +67,10 @@ if [ -f $data/text ] && [ $nu -ne $nt ]; then
   echo "** use utils/fix_data_dir.sh to fix this."
 fi
 
+
 if $split_per_spk; then
   utt2spk_opt="--utt2spk=$data/utt2spk"
   utt=""
-elif $split_per_reco; then
-  utils/data/get_reco2utt.sh $data
-  utils/spk2utt_to_utt2spk.pl $data/reco2utt > $data/utt2reco
-  utt2spk_opt="--utt2spk=$data/utt2reco"
-  utt="reco"
 else
   utt2spk_opt=
   utt="utt"
@@ -106,7 +94,6 @@ if ! $need_to_split; then
 fi
 
 utt2spks=$(for n in `seq $numsplit`; do echo $data/split${numsplit}${utt}/$n/utt2spk; done)
-utt2recos=$(for n in `seq $numsplit`; do echo $data/split${numsplit}${utt}/$n/utt2reco; done)
 
 directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}${utt}/$n; done)
 
@@ -121,20 +108,11 @@ fi
 which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
 trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM
 
-if $split_per_reco; then
-  utils/split_scp.pl $utt2spk_opt $data/utt2reco $utt2recos || exit 1
-else
-  utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
-fi
+utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
 
 for n in `seq $numsplit`; do
   dsn=$data/split${numsplit}${utt}/$n
-
-  if $split_per_reco; then
-    utils/filter_scp.pl $dsn/utt2reco $data/utt2spk > $dsn/utt2spk
-  fi
-
-  utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1
+  utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1;
 done
 
 maybe_wav_scp=
@@ -176,12 +154,6 @@ if [ -f $data/segments ]; then
       $data/split${numsplit}${utt}/JOB/tmp.reco $data/wav.scp \
       $data/split${numsplit}${utt}/JOB/wav.scp || exit 1
   fi
-  if [ -f $data/reco2utt ]; then
-    utils/filter_scps.pl JOB=1:$numsplit \
-      $data/split${numsplit}${utt}/JOB/tmp.reco $data/reco2utt \
-      $data/split${numsplit}${utt}/JOB/reco2utt || exit 1
-  fi
-
   for f in $data/split${numsplit}${utt}/*/tmp.reco; do rm $f; done
 fi