From af735eb75bf55e6b1e41602105a6f939aedbaf5c Mon Sep 17 00:00:00 2001
From: Zengwei Yao <yaozengwei@outlook.com>
Date: Wed, 8 Feb 2023 21:54:35 +0800
Subject: [PATCH] Get alignments using lhotse workflows align-with-torchaudio
 (#888)

* add lhotse workflow align-with-torchaudio

* modify related decode.py files
---
 egs/librispeech/ASR/add_alignments.sh         | 50 +++++++++++++++++--
 egs/librispeech/ASR/conformer_ctc3/decode.py  |  5 +-
 .../ASR/lstm_transducer_stateless3/decode.py  |  5 +-
 .../pruned_transducer_stateless4/decode.py    |  5 +-
 4 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/egs/librispeech/ASR/add_alignments.sh b/egs/librispeech/ASR/add_alignments.sh
index 5e4480bf6f..6c47d25a20 100755
--- a/egs/librispeech/ASR/add_alignments.sh
+++ b/egs/librispeech/ASR/add_alignments.sh
@@ -2,11 +2,51 @@
 
 set -eou pipefail
 
-alignments_dir=data/alignment
+# align could be in ("mfa", "torchaudio")
+# We recommend "torchaudio"
+align="torchaudio"
+
+# It adds alignments to the existing fbank features dir (e.g., data/fbank)
+# and saves the cuts to a new dir (e.g., data/fbank_ali).
 cuts_in_dir=data/fbank
 cuts_out_dir=data/fbank_ali
 
-python3 ./local/add_alignment_librispeech.py \
-  --alignments-dir $alignments_dir \
-  --cuts-in-dir $cuts_in_dir \
-  --cuts-out-dir $cuts_out_dir
+if [ "$align" == "mfa" ]; then
+  # It adds alignments from https://github.com/CorentinJ/librispeech-alignments,
+  # generated using the Montreal Forced Aligner (https://montreal-forced-aligner.readthedocs.io).
+  alignments_dir=data/alignment
+
+  python3 ./local/add_alignment_librispeech.py \
+    --alignments-dir "$alignments_dir" \
+    --cuts-in-dir "$cuts_in_dir" \
+    --cuts-out-dir "$cuts_out_dir"
+elif [ "$align" == "torchaudio" ]; then
+  # See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/bin/modes/workflows.py for details.
+  #
+  # It uses a pretrained ASR model from torchaudio to generate alignments.
+  # It will attach word-level alignment information (start, end, and score) to the
+  # supervisions in each cut.
+  mkdir -p "$cuts_out_dir"
+
+  parts=(
+    train-clean-100
+    train-clean-360
+    train-other-500
+    test-clean
+    test-other
+    dev-clean
+    dev-other
+  )
+
+  echo "The alignments will be saved to $cuts_out_dir"
+  for part in "${parts[@]}"; do
+    echo "Start to align $part"
+    lhotse workflows align-with-torchaudio --dont-normalize-text \
+      "$cuts_in_dir/librispeech_cuts_${part}.jsonl.gz" \
+      "$cuts_out_dir/librispeech_cuts_${part}.jsonl.gz"
+  done
+  echo "Finished"
+else
+  echo "align is expected to be in ('mfa', 'torchaudio'), but got $align" >&2
+  exit 1
+fi
diff --git a/egs/librispeech/ASR/conformer_ctc3/decode.py b/egs/librispeech/ASR/conformer_ctc3/decode.py
index 3b24ad5971..2300fecc3a 100755
--- a/egs/librispeech/ASR/conformer_ctc3/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc3/decode.py
@@ -40,10 +40,7 @@
 
 To evaluate symbol delay, you should:
 (1) Generate cuts with word-time alignments:
-./local/add_alignment_librispeech.py \
-    --alignments-dir data/alignment \
-    --cuts-in-dir data/fbank \
-    --cuts-out-dir data/fbank_ali
+./add_alignments.sh
 (2) Set the argument "--manifest-dir data/fbank_ali" while decoding.
 For example:
 ./conformer_ctc3/decode.py \
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py
index b7953e5e39..832b994337 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py
@@ -94,10 +94,7 @@
 
 To evaluate symbol delay, you should:
 (1) Generate cuts with word-time alignments:
-./local/add_alignment_librispeech.py \
-    --alignments-dir data/alignment \
-    --cuts-in-dir data/fbank \
-    --cuts-out-dir data/fbank_ali
+./add_alignments.sh
 (2) Set the argument "--manifest-dir data/fbank_ali" while decoding.
 For example:
 ./lstm_transducer_stateless3/decode.py \
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py
index f5cbc21f77..5fa129a898 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py
@@ -109,10 +109,7 @@
 
 To evaluate symbol delay, you should:
 (1) Generate cuts with word-time alignments:
-./local/add_alignment_librispeech.py \
-    --alignments-dir data/alignment \
-    --cuts-in-dir data/fbank \
-    --cuts-out-dir data/fbank_ali
+./add_alignments.sh
 (2) Set the argument "--manifest-dir data/fbank_ali" while decoding.
 For example:
 ./pruned_transducer_stateless4/decode.py \