Training with byte level BPE (TAL_CSASR) (#1033)
* Add byte level bpe tal_csasr recipe

* Minor fixes to decoding and exporting

* Fix prepare.sh

* Update results
pkufool authored May 16, 2023
1 parent 7a9f40a commit bccd20d
Showing 22 changed files with 3,134 additions and 32 deletions.
46 changes: 46 additions & 0 deletions egs/tal_csasr/ASR/RESULTS.md
@@ -1,5 +1,51 @@
## Results

### Pruned transducer stateless 7 (zipformer)

See <https://github.com/k2-fsa/icefall/pull/1033>

[./pruned_transducer_stateless7_bbpe](./pruned_transducer_stateless7_bbpe)

**Note**: The modeling units are byte-level BPEs.
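
For reference, "byte level BPE" means the BPE model is trained on and applied to a byte-encoded version of the transcript: every UTF-8 byte is first mapped to a printable Unicode character, so Chinese characters never need dedicated units in the vocabulary. The snippet below is only a rough sketch of that idea; the byte-to-symbol mapping here is illustrative and not the exact one used by icefall's byte utilities.

```
# Minimal sketch of byte-level text encoding for BPE.
# Assumption: map UTF-8 byte b to the printable character U+0100 + b;
# icefall's real mapping may differ.
def byte_encode(text: str) -> str:
    return "".join(chr(0x0100 + b) for b in text.encode("utf-8"))

def byte_decode(encoded: str) -> str:
    # errors="ignore" drops bytes that do not form valid UTF-8,
    # e.g. when a BPE piece splits a multi-byte character.
    return bytes(ord(c) - 0x0100 for c in encoded).decode("utf-8", errors="ignore")

mixed = "我们今天 learn BPE"
assert byte_decode(byte_encode(mixed)) == mixed
```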

The best results I have gotten are:

Vocab size | greedy (dev & test) | modified beam search (dev & test) | comment
-- | -- | -- | --
500 | 6.88 & 6.98 | 6.87 & 6.94 | --epoch 35 --avg 26

The training command:

```
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./pruned_transducer_stateless7_bbpe/train.py \
--world-size 4 \
--start-epoch 1 \
--num-epochs 35 \
--use-fp16 1 \
--max-duration 800 \
--bbpe-model data/lang_bbpe_500/bbpe.model \
--exp-dir pruned_transducer_stateless7_bbpe/exp \
--master-port 12535
```

The decoding command:

```
./pruned_transducer_stateless7_bbpe/decode.py \
--epoch 35 \
--avg 26 \
--exp-dir ./pruned_transducer_stateless7_bbpe/exp \
--max-sym-per-frame 1 \
--bpe-model data/lang_bbpe_500/bbpe.model \
--max-duration 2000 \
--decoding-method greedy_search # modified_beam_search
```

The pretrained model is available at: https://huggingface.co/pkufool/icefall_asr_tal_csasr_pruned_transducer_stateless7_bbpe
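
To try the checkpoint without training, one option (assuming the `huggingface_hub` package is installed; a `git lfs clone` of the repository works just as well) is:

```
# Sketch: download the released model files from Hugging Face.
# Requires: pip install huggingface_hub
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="pkufool/icefall_asr_tal_csasr_pruned_transducer_stateless7_bbpe"
)
print("Model files downloaded to:", local_dir)
```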


### TAL_CSASR Mix Chars and BPEs training results (Pruned Transducer Stateless5)

#### 2022-06-22
3 changes: 2 additions & 1 deletion egs/tal_csasr/ASR/local/prepare_char.py
@@ -211,8 +211,9 @@ def main():
lang_dir = Path("data/lang_char")
text_file = lang_dir / "text_with_bpe"
bpe_model = lang_dir / "bpe.model"
words_file = lang_dir / "words.txt"

word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
word_sym_table = k2.SymbolTable.from_file(words_file)

words = word_sym_table.symbols

1 change: 1 addition & 0 deletions egs/tal_csasr/ASR/local/train_bbpe_model.py
80 changes: 49 additions & 31 deletions egs/tal_csasr/ASR/prepare.sh
@@ -31,6 +31,15 @@ dl_dir=$PWD/download

. shared/parse_options.sh || exit 1

# Vocabulary sizes for the SentencePiece models.
# The script generates data/lang_bbpe_xxx and data/lang_bbpe_yyy
# if the array contains xxx and yyy.
vocab_sizes=(
# 2000
1000
500
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
@@ -117,55 +126,44 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
# You can also use other BPE models if available.
if [ ! -f $lang_char_dir/bpe.model ]; then
wget -O $lang_char_dir/bpe.model \
https://huggingface.co/luomingshuang/bpe_models_trained_with_Librispeech/resolve/main/lang_bpe_5000/bpe.model
https://huggingface.co/luomingshuang/bpe_models_trained_with_Librispeech/resolve/main/lang_bpe_500/bpe.model
fi

# Prepare text.
# Note: in Linux, you can install jq with the following command:
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
# 2. chmod +x ./jq
# 3. cp jq /usr/bin
if [ ! -f $lang_char_dir/text_full ]; then
# We extract text from the manifests rather than from label.txt in the corpus,
# because the text in the manifests has already been normalized by lhotse.
if [ ! -f $lang_char_dir/text ]; then
gunzip -c data/manifests/tal_csasr/tal_csasr_supervisions_train_set.jsonl.gz \
| jq ".text" | sed 's/"//g' \
| grep -o 'text":\s[^,]*' | sed 's/text": "//g;s/"//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text_train

gunzip -c data/manifests/tal_csasr/tal_csasr_supervisions_dev_set.jsonl.gz \
| jq ".text" | sed 's/"//g' \
| grep -o 'text":\s[^,]*' | sed 's/text": "//g;s/"//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text_dev

gunzip -c data/manifests/tal_csasr/tal_csasr_supervisions_test_set.jsonl.gz \
| jq ".text" | sed 's/"//g' \
| grep -o 'text":\s[^,]*' | sed 's/text": "//g;s/"//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text_test

for r in text_train text_dev text_test ; do
cat $lang_char_dir/$r >> $lang_char_dir/text_full
cat $lang_char_dir/$r >> $lang_char_dir/text
done
fi

# Prepare text normalize
if [ ! -f $lang_char_dir/text ]; then
python ./local/text_normalize.py \
--input $lang_char_dir/text_full \
--output $lang_char_dir/text
fi
# Prepare words.txt
# We assume you have installed jieba; if not, please install
# it using: pip install jieba
if [ ! -f $lang_char_dir/words.txt ]; then
python -m jieba $lang_char_dir/text | sed 's/\///g;s/\s\+/ /g' > $lang_char_dir/text.seg

# Prepare words segments
if [ ! -f $lang_char_dir/text_words_segmentation ]; then
python ./local/text2segments.py \
--input $lang_char_dir/text \
--output $lang_char_dir/text_words_segmentation
(echo '<eps> 0'; echo '!SIL 1'; echo '<SPOKEN_NOISE> 2'; echo '<UNK> 3';) \
> $lang_char_dir/words.txt

cat $lang_char_dir/text_words_segmentation | sed "s/ /\n/g" \
| sort -u | sed "/^$/d" \
| uniq > $lang_char_dir/words_no_ids.txt
fi
cat $lang_char_dir/text.seg | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
| awk '{print $1" "NR+3}' >> $lang_char_dir/words.txt

# Prepare words.txt
if [ ! -f $lang_char_dir/words.txt ]; then
./local/prepare_words.py \
--input $lang_char_dir/words_no_ids.txt \
--output $lang_char_dir/words.txt
num_lines=$(< $lang_char_dir/words.txt wc -l)
(echo "#0 $num_lines"; echo "<s> $(($num_lines + 1))"; echo "</s> $(($num_lines + 2))";) \
>> $lang_char_dir/words.txt
fi

# Tokenize text with BPE model
@@ -178,3 +176,23 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
python local/prepare_char.py
fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 7: Prepare Byte BPE based lang"

for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bbpe_${vocab_size}
mkdir -p $lang_dir
# We reuse words.txt from phone based lexicon
# so that the two can share G.pt later.
cp $lang_char_dir/words.txt $lang_dir
cp $lang_char_dir/text $lang_dir

if [ ! -f $lang_dir/bbpe.model ]; then
./local/train_bbpe_model.py \
--lang-dir $lang_dir \
--vocab-size $vocab_size \
--transcript $lang_dir/text
fi
done
fi
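
Stage 6 above trains one SentencePiece model per entry in `vocab_sizes` on the byte-encoded transcript. Roughly, `./local/train_bbpe_model.py` does something like the following simplified sketch; the byte mapping and the SentencePiece options shown are assumptions, see the script itself for the exact implementation.

```
# Simplified sketch of byte-level BPE training on a transcript file.
import sentencepiece as spm

def byte_encode(text: str) -> str:
    # Assumed byte -> printable-symbol mapping (see the sketch above).
    return "".join(chr(0x0100 + b) for b in text.encode("utf-8"))

# Byte-encode the transcript before training SentencePiece.
with open("data/lang_bbpe_500/text", encoding="utf-8") as fin, \
     open("data/lang_bbpe_500/transcript_bytes.txt", "w", encoding="utf-8") as fout:
    for line in fin:
        fout.write(byte_encode(line.strip()) + "\n")

# Train the model; model_type and character_coverage are assumed defaults.
spm.SentencePieceTrainer.train(
    input="data/lang_bbpe_500/transcript_bytes.txt",
    model_prefix="data/lang_bbpe_500/bbpe",
    vocab_size=500,
    model_type="unigram",
    character_coverage=1.0,
)
```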
