From af735eb75bf55e6b1e41602105a6f939aedbaf5c Mon Sep 17 00:00:00 2001 From: Zengwei Yao Date: Wed, 8 Feb 2023 21:54:35 +0800 Subject: [PATCH] Get alignments using lhotse workflows align-with-torchaudio (#888) * add lhotse workflow align-with-torchaudio * modify related decode.py files --- egs/librispeech/ASR/add_alignments.sh | 50 +++++++++++++++++-- egs/librispeech/ASR/conformer_ctc3/decode.py | 5 +- .../ASR/lstm_transducer_stateless3/decode.py | 5 +- .../pruned_transducer_stateless4/decode.py | 5 +- 4 files changed, 48 insertions(+), 17 deletions(-) diff --git a/egs/librispeech/ASR/add_alignments.sh b/egs/librispeech/ASR/add_alignments.sh index 5e4480bf6f..6c47d25a20 100755 --- a/egs/librispeech/ASR/add_alignments.sh +++ b/egs/librispeech/ASR/add_alignments.sh @@ -2,11 +2,51 @@ set -eou pipefail -alignments_dir=data/alignment +# align could be in ("mfa", "torchaudio") +# We recommend "torchaudio" +align="torchaudio" + +# It adds alignments to the existing fbank features dir (e.g., data/fbank) +# and saves cuts to a new dir (e.g., data/fbank_ali). cuts_in_dir=data/fbank cuts_out_dir=data/fbank_ali -python3 ./local/add_alignment_librispeech.py \ - --alignments-dir $alignments_dir \ - --cuts-in-dir $cuts_in_dir \ - --cuts-out-dir $cuts_out_dir +if [ $align == "mfa" ]; then + # It adds alignments from https://github.com/CorentinJ/librispeech-alignments, + # generated using the Montreal Forced Aligner (https://montreal-forced-aligner.readthedocs.io). + alignments_dir=data/alignment + + python3 ./local/add_alignment_librispeech.py \ + --alignments-dir $alignments_dir \ + --cuts-in-dir $cuts_in_dir \ + --cuts-out-dir $cuts_out_dir +elif [ $align == "torchaudio" ]; then + # See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/bin/modes/workflows.py for details. + # + # It uses a pretrained ASR model from torchaudio to generate alignments. + # It will attach word-level alignment information (start, end, and score) to the + # supervisions in each cut. 
+ mkdir -p $cuts_out_dir + + parts=( + train-clean-100 + train-clean-360 + train-other-500 + test-clean + test-other + dev-clean + dev-other + ) + + echo "The alignments will be saved to $cuts_out_dir" + for part in ${parts[@]}; do + echo "Start to align $part" + lhotse workflows align-with-torchaudio --dont-normalize-text \ + $cuts_in_dir/librispeech_cuts_${part}.jsonl.gz \ + $cuts_out_dir/librispeech_cuts_${part}.jsonl.gz + done + echo "Finished" +else + echo "align is expected to be in ('mfa', 'torchaudio'), but got $align" + exit 1 +fi diff --git a/egs/librispeech/ASR/conformer_ctc3/decode.py b/egs/librispeech/ASR/conformer_ctc3/decode.py index 3b24ad5971..2300fecc3a 100755 --- a/egs/librispeech/ASR/conformer_ctc3/decode.py +++ b/egs/librispeech/ASR/conformer_ctc3/decode.py @@ -40,10 +40,7 @@ To evaluate symbol delay, you should: (1) Generate cuts with word-time alignments: -./local/add_alignment_librispeech.py \ - --alignments-dir data/alignment \ - --cuts-in-dir data/fbank \ - --cuts-out-dir data/fbank_ali +./add_alignments.sh (2) Set the argument "--manifest-dir data/fbank_ali" while decoding. For example: ./conformer_ctc3/decode.py \ diff --git a/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py index b7953e5e39..832b994337 100755 --- a/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py @@ -94,10 +94,7 @@ To evaluate symbol delay, you should: (1) Generate cuts with word-time alignments: -./local/add_alignment_librispeech.py \ - --alignments-dir data/alignment \ - --cuts-in-dir data/fbank \ - --cuts-out-dir data/fbank_ali +./add_alignments.sh (2) Set the argument "--manifest-dir data/fbank_ali" while decoding. 
For example: ./lstm_transducer_stateless3/decode.py \ diff --git a/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py index f5cbc21f77..5fa129a898 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py @@ -109,10 +109,7 @@ To evaluate symbol delay, you should: (1) Generate cuts with word-time alignments: -./local/add_alignment_librispeech.py \ - --alignments-dir data/alignment \ - --cuts-in-dir data/fbank \ - --cuts-out-dir data/fbank_ali +./add_alignments.sh (2) Set the argument "--manifest-dir data/fbank_ali" while decoding. For example: ./pruned_transducer_stateless4/decode.py \