add mini_librispeech example
YiwenShaoStephen committed Aug 26, 2020
1 parent 79c1b7c commit f2ccc3b
Showing 15 changed files with 511 additions and 0 deletions.
21 changes: 21 additions & 0 deletions examples/mini_librispeech/cmd.sh
@@ -0,0 +1,21 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

#export train_cmd="run.pl --mem 4G"
#export cuda_cmd="run.pl --mem 4G --gpu 1"
#export decode_cmd="run.pl --mem 4G"
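
# A SLURM setup would look similar (illustrative only; the memory limits and
# any partition options are assumptions to adapt to your cluster):
#export train_cmd="slurm.pl --mem 4G"
#export cuda_cmd="slurm.pl --mem 4G --gpu 1"
#export decode_cmd="slurm.pl --mem 4G"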

# JHU setup
export train_cmd="queue.pl --mem 4G"
#export cuda_cmd="queue.pl --mem 4G --gpu 1 --config conf/gpu.conf"
export cuda_cmd="queue-freegpu.pl --mem 4G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"
10 changes: 10 additions & 0 deletions examples/mini_librispeech/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training
# Note: we keep all cepstra, so the features carry the same information as
# filterbank features, but MFCCs are more easily compressible (the
# coefficients are less correlated), which is why we prefer them.
--use-energy=false # use average of log energy, not energy.
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so
# there might be some information at the low end.
--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
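
# For reference, a config like this is passed to Kaldi's feature extraction
# via the generic --config option, e.g. (paths are illustrative):
#   compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/train/wav.scp ark:-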
1 change: 1 addition & 0 deletions examples/mini_librispeech/local/data_prep.sh
1 change: 1 addition & 0 deletions examples/mini_librispeech/local/download_and_untar.sh
1 change: 1 addition & 0 deletions examples/mini_librispeech/local/download_lm.sh
1 change: 1 addition & 0 deletions examples/mini_librispeech/local/format_lms.sh
1 change: 1 addition & 0 deletions examples/mini_librispeech/local/prepare_dict.sh
1 change: 1 addition & 0 deletions examples/mini_librispeech/local/score_kaldi_wer.sh
15 changes: 15 additions & 0 deletions examples/mini_librispeech/path.sh
@@ -0,0 +1,15 @@
MAIN_ROOT=$PWD/../..
KALDI_ROOT=$MAIN_ROOT/tools/kaldi

# BEGIN from kaldi path.sh
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
# END

export PATH=$MAIN_ROOT:$MAIN_ROOT/tools:$PATH
export LD_LIBRARY_PATH=$MAIN_ROOT/tools/pychain/openfst/lib:$LD_LIBRARY_PATH
export PYTHONPATH=$MAIN_ROOT:$MAIN_ROOT/tools:$MAIN_ROOT/tools/pychain:$PYTHONPATH
export PYTHONUNBUFFERED=1
107 changes: 107 additions & 0 deletions examples/mini_librispeech/prepare_e2e.sh
@@ -0,0 +1,107 @@
#!/usr/bin/env bash
# Copyright 2017 Hossein Hadian
# 2020 Yiwen Shao
# Apache 2.0

# To be run from ..
# Flat start chain model training.

# This script initializes a trivial tree and transition model
# for flat-start chain training. (The training graphs themselves
# are compiled later, in prepare_graph.sh.)

# Begin configuration section.
cmd=run.pl
nj=4
stage=0
shared_phones=true
treedir= # If specified, the tree and model will be copied from there;
# note that the result may no longer be a flat start.
type=mono # Can be either mono or biphone -- either way
# the resulting tree is full (i.e. it doesn't do any tying)
ci_silence=false # If true, silence phones will be treated as context independent

scale_opts="--transition-scale=0.0 --self-loop-scale=0.0"
tie=false # If true, gmm-init-biphone will do some tying when
# creating the full biphone tree (it won't be full anymore).
# Specifically, it will revert to monophone if the data
# counts for a biphone are smaller than min_biphone_count.
# If the monophone count is also smaller than min_monophone_count,
# it will revert to a shared global phone. Note that this
# only affects biphone models (i.e., type=biphone) which
# use the special chain topology.
min_biphone_count=100
min_monophone_count=20
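
# Sketch of the count-based back-off described above (illustrative, not the
# exact gmm-init-biphone logic):
#   if   count(biphone)   >= min_biphone_count:   keep a dedicated biphone leaf
#   elif count(monophone) >= min_monophone_count: share the monophone leaf
#   else:                                         use a single shared global leaf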
# End configuration section.

echo "$0 $@" # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
echo "Usage: steps/prepare_e2e.sh [options] <data-dir> <lang-dir> <tree-dir>"
echo " e.g.: steps/prepare_e2e.sh data/train data/lang_chain exp/chain/e2e_tree"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --type <mono | biphone> # context dependency type"
echo " --tie <true | false> # enable/disable count-based tying"
exit 1;
fi

data=$1
lang=$2
dir=$3

if [[ "$type" != "mono" && "$type" != "biphone" ]]; then
echo "'type' should be either mono or biphone."
exit 1;
fi

mkdir -p $dir/log

echo $scale_opts > $dir/scale_opts # just for easier reference (it is in the logs too)
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
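# Re-split the data only if the split dir is missing or older than feats.scp.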
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

cp $lang/phones.txt $dir || exit 1;

[ ! -f $lang/phones/sets.int ] && echo >&2 "$0: expected $lang/phones/sets.int to exist" && exit 1;

if $shared_phones; then
shared_phones_opt="--shared-phones=$lang/phones/sets.int"
fi

ciphonelist=$(cat $lang/phones/context_indep.csl) || exit 1;
if $ci_silence; then
ci_opt="--ci-phones=$ciphonelist"
fi

tie_opts=
if $tie && [[ "$type" = "biphone" ]]; then
cat $data/text | steps/chain/e2e/text_to_phones.py --edge-silprob 0 \
--between-silprob 0 \
$lang | \
cut -d' ' -f 2- | utils/sym2int.pl $lang/phones.txt | \
steps/chain/e2e/compute_biphone_stats.py $lang >$dir/phone-stats.txt
tie_opts="--min-biphone-count=$min_biphone_count \
--min-monophone-count=$min_monophone_count --phone-counts=$dir/phone-stats.txt"
fi

if [ $stage -le 0 ]; then
if [ -z "$treedir" ]; then
echo "$0: Initializing $type system."
# feat dim does not matter here. Just set it to 10
$cmd $dir/log/init_${type}_mdl_tree.log \
gmm-init-$type $tie_opts $ci_opt $shared_phones_opt $lang/topo 10 \
$dir/0.mdl $dir/tree || exit 1;
else
echo "$0: Copied tree/mdl from $treedir." >$dir/log/init_mdl_tree.log
cp $treedir/final.mdl $dir/0.mdl || exit 1;
cp $treedir/tree $dir || exit 1;
fi
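# Write out the transition model on its own; later graph-building steps
# (e.g. chain-make-den-fst in prepare_graph.sh) read 0.trans_mdl.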
copy-transition-model $dir/0.mdl $dir/0.trans_mdl
ln -s 0.mdl $dir/final.mdl # for consistency with scripts which require a final.mdl
fi
52 changes: 52 additions & 0 deletions examples/mini_librispeech/prepare_feat.sh
@@ -0,0 +1,52 @@
#!/bin/bash
# Copyright (c) Yiwen Shao

# Apache 2.0

# data related
rootdir=data
dumpdir=data/dump # directory to dump full features

train_set=train_clean_5
valid_set=dev_clean_2

train_subset_size=0
stage=0

# feature configuration
do_delta=false

. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh


if [ ${stage} -le 0 ]; then
echo "Extracting MFCC features"
for x in $train_set $valid_set; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 \
--mfcc-config conf/mfcc_hires.conf $rootdir/${x}
# compute global CMVN
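# (the output is a plain filename, not an ark: wspecifier, so
# compute-cmvn-stats writes a single global stats file rather than
# per-utterance stats)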
compute-cmvn-stats scp:$rootdir/${x}/feats.scp $rootdir/${x}/cmvn.ark
done
fi

train_feat_dir=${dumpdir}/${train_set}; mkdir -p ${train_feat_dir}
valid_feat_dir=${dumpdir}/${valid_set}; mkdir -p ${valid_feat_dir}

if [ ${stage} -le 1 ]; then
echo "Dumping Features with CMVN"
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
$rootdir/${train_set}/feats.scp $rootdir/${train_set}/cmvn.ark ${train_feat_dir}/log ${train_feat_dir}
dump.sh --cmd "$train_cmd" --nj 4 --do_delta $do_delta \
$rootdir/${valid_set}/feats.scp $rootdir/${valid_set}/cmvn.ark ${valid_feat_dir}/log ${valid_feat_dir}

fi

# randomly select a subset of train set for optional diagnosis
if [ $train_subset_size -gt 0 ]; then
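# subset_data_dir.sh picks the utterances; filter_scp.pl then restricts the
# already-dumped feats.scp to just those utterance ids.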
train_subset_feat_dir=${dumpdir}/${train_set}_${train_subset_size}; mkdir -p ${train_subset_feat_dir}
utils/subset_data_dir.sh $rootdir/${train_set} ${train_subset_size} $rootdir/${train_set}_${train_subset_size}
utils/filter_scp.pl $rootdir/${train_set}_${train_subset_size}/utt2spk ${train_feat_dir}/feats.scp \
> ${train_subset_feat_dir}/feats.scp
fi
94 changes: 94 additions & 0 deletions examples/mini_librispeech/prepare_graph.sh
@@ -0,0 +1,94 @@
#!/bin/bash
# Copyright (c) Yiwen Shao
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

set -e -o pipefail

stage=0
train_set=train_clean_5
valid_set=dev_clean_2
rootdir=data

langdir=data/lang
graphdir=data/graph
type=mono
unit=phone

nj=10

# compile-train-graphs (stage 2 below) needs these scales; flat-start chain
# training uses zero transition/self-loop scales (the same values set in
# prepare_e2e.sh).
scale_opts="--transition-scale=0.0 --self-loop-scale=0.0"

. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh

lang=$langdir/lang_${type}${unit}_e2e
graph=$graphdir/${type}${unit}

if [ $stage -le 0 ]; then
echo "$0: Stage 0: Phone LM estimating"
rm -rf $lang
cp -r $langdir/lang_nosp $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo

echo "Estimating a phone language model for the denominator graph..."
mkdir -p $graph/log
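# Convert the word transcripts to phone sequences, map them to integer ids,
# and estimate a phone-level LM for the chain denominator; the
# --num-extra-lm-states option bounds the size of the resulting LM FST.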
$train_cmd $graph/log/make_phone_lm.log \
cat $rootdir/$train_set/text \| \
steps/nnet3/chain/e2e/text_to_phones.py --between-silprob 0.1 \
$langdir/lang_nosp \| \
utils/sym2int.pl -f 2- $langdir/lang_nosp/phones.txt \| \
chain-est-phone-lm --num-extra-lm-states=2000 \
ark:- $graph/phone_lm.fst
fi

if [ $stage -le 1 ]; then
echo "$0: Stage 1: Graph generation..."
if [ $type == "bi" ]; then
type_arg=biphone # prepare_e2e.sh takes either "mono" or "biphone" as its type argument
else
type_arg=$type
fi
prepare_e2e.sh --nj $nj --cmd "$train_cmd" \
--type $type_arg \
--shared-phones true \
$rootdir/$train_set $lang $graph
echo "Making denominator graph..."
$train_cmd $graph/log/make_den_fst.log \
chain-make-den-fst $graph/tree $graph/0.trans_mdl \
$graph/phone_lm.fst \
$graph/den.fst $graph/normalization.fst
fi


if [ $stage -le 2 ]; then
echo "Making numerator graph..."
lex=$lang/L.fst
oov_sym=$(cat $lang/oov.int) || exit 1;
for x in $train_set $valid_set; do
sdata=$rootdir/$x/split$nj;
[[ -d $sdata && $rootdir/$x/feats.scp -ot $sdata ]] || split_data.sh $rootdir/$x $nj || exit 1;
$train_cmd JOB=1:$nj $graph/$x/log/compile_graphs.JOB.log \
compile-train-graphs $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
$graph/tree $graph/0.mdl $lex \
"ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt < $sdata/JOB/text|" \
"ark,scp:$graph/$x/fst.JOB.ark,$graph/$x/fst.JOB.scp" || exit 1;
$train_cmd JOB=1:$nj $graph/$x/log/make_num_fst.JOB.log \
chain-make-num-fst-e2e $graph/0.trans_mdl $graph/normalization.fst \
scp:$graph/$x/fst.JOB.scp ark,scp:$graph/$x/num.JOB.ark,$graph/$x/num.JOB.scp
for id in $(seq $nj); do cat $graph/$x/num.$id.scp; done > $graph/$x/num.scp
done
fi

if [ $stage -le 3 ]; then
echo "Making HCLG full graph..."
utils/lang/check_phones_compatible.sh \
$langdir/lang_nosp_test_tgsmall/phones.txt $lang/phones.txt
utils/mkgraph.sh \
--self-loop-scale 1.0 $langdir/lang_nosp_test_tgsmall \
$graph $graph/graph_tgsmall || exit 1;
fi