From 562bda91e498ce5b4188c78f16131e58d0d9a2e5 Mon Sep 17 00:00:00 2001 From: Yifan Yang <64255737+yfyeung@users.noreply.github.com> Date: Wed, 17 May 2023 16:02:27 +0800 Subject: [PATCH] Add adaption recipe for pruned_transducer_stateless7 (#1059) * Add mux for finetune * Add comments * Fix for black * Update finetune.py --- egs/librispeech/ASR/finetune.sh | 1 + .../decode_gigaspeech.py | 35 ++++---- .../pruned_transducer_stateless7/finetune.py | 88 +++++++++++++------ 3 files changed, 78 insertions(+), 46 deletions(-) diff --git a/egs/librispeech/ASR/finetune.sh b/egs/librispeech/ASR/finetune.sh index 63d0966ed0..bc6357312e 100755 --- a/egs/librispeech/ASR/finetune.sh +++ b/egs/librispeech/ASR/finetune.sh @@ -79,6 +79,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then --use-averaged-model True \ --beam-size 4 \ --exp-dir pruned_transducer_stateless7/exp_giga_finetune \ + --bpe-model icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11/data/lang_bpe_500/bpe.model \ --max-duration 400 \ --decoding-method $m done diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/decode_gigaspeech.py b/egs/librispeech/ASR/pruned_transducer_stateless7/decode_gigaspeech.py index 4f64850b65..b0e4be0d1a 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/decode_gigaspeech.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/decode_gigaspeech.py @@ -1,8 +1,9 @@ #!/usr/bin/env python3 # -# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang, +# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang, # Zengwei Yao, -# Xiaoyu Yang) +# Xiaoyu Yang, +# Yifan Yang,) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -20,36 +21,36 @@ """ Usage: (1) greedy search -./pruned_transducer_stateless7/decode.py \ +./pruned_transducer_stateless7/decode_gigaspeech.py \ --epoch 28 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless7/exp \ + --exp-dir ./pruned_transducer_stateless7/exp_giga_finetune \ --max-duration 600 \ --decoding-method greedy_search (2) beam search (not recommended) -./pruned_transducer_stateless7/decode.py \ +./pruned_transducer_stateless7/decode_gigaspeech.py \ --epoch 28 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless7/exp \ + --exp-dir ./pruned_transducer_stateless7/exp_giga_finetune \ --max-duration 600 \ --decoding-method beam_search \ --beam-size 4 (3) modified beam search -./pruned_transducer_stateless7/decode.py \ +./pruned_transducer_stateless7/decode_gigaspeech.py \ --epoch 28 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless7/exp \ + --exp-dir ./pruned_transducer_stateless7/exp_giga_finetune \ --max-duration 600 \ --decoding-method modified_beam_search \ --beam-size 4 (4) fast beam search (one best) -./pruned_transducer_stateless7/decode.py \ +./pruned_transducer_stateless7/decode_gigaspeech.py \ --epoch 28 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless7/exp \ + --exp-dir ./pruned_transducer_stateless7/exp_giga_finetune \ --max-duration 600 \ --decoding-method fast_beam_search \ --beam 20.0 \ @@ -57,10 +58,10 @@ --max-states 64 (5) fast beam search (nbest) -./pruned_transducer_stateless7/decode.py \ +./pruned_transducer_stateless7/decode_gigaspeech.py \ --epoch 28 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless7/exp \ + --exp-dir ./pruned_transducer_stateless7/exp_giga_finetune \ --max-duration 600 \ --decoding-method fast_beam_search_nbest \ --beam 20.0 \ @@ -70,10 +71,10 @@ --nbest-scale 0.5 (6) fast beam search (nbest oracle WER) -./pruned_transducer_stateless7/decode.py \ +./pruned_transducer_stateless7/decode_gigaspeech.py \ --epoch 28 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless7/exp \ + --exp-dir ./pruned_transducer_stateless7/exp_giga_finetune \ --max-duration 600 \ --decoding-method fast_beam_search_nbest_oracle \ --beam 20.0 \ @@ -83,10 +84,10 @@ --nbest-scale 0.5 (7) fast beam search (with LG) -./pruned_transducer_stateless7/decode.py \ +./pruned_transducer_stateless7/decode_gigaspeech.py \ --epoch 28 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless7/exp \ + --exp-dir ./pruned_transducer_stateless7/exp_giga_finetune \ --max-duration 600 \ --decoding-method fast_beam_search_nbest_LG \ --beam 20.0 \ @@ -187,7 +188,7 @@ def get_parser(): parser.add_argument( "--exp-dir", type=str, - default="pruned_transducer_stateless7/exp", + default="pruned_transducer_stateless7/exp_giga_finetune", help="The experiment dir", ) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py b/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py index 726a24809e..02e8bd9eb4 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 -# Copyright 2021-2022 Xiaomi Corp. (authors: Fangjun Kuang, +# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang, # Wei Kang, -# Mingshuang Luo,) -# Zengwei Yao) +# Mingshuang Luo, +# Zengwei Yao, +# Xiaoyu Yang, +# Yifan Yang) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -20,27 +22,23 @@ """ Usage: -export CUDA_VISIBLE_DEVICES="0,1,2,3" +export CUDA_VISIBLE_DEVICES="0,1" -./pruned_transducer_stateless7/train.py \ - --world-size 4 \ - --num-epochs 30 \ - --start-epoch 1 \ - --exp-dir pruned_transducer_stateless7/exp \ - --full-libri 1 \ - --max-duration 300 - -# For mix precision training: - -./pruned_transducer_stateless7/train.py \ - --world-size 4 \ - --num-epochs 30 \ +./pruned_transducer_stateless7/finetune.py \ + --world-size 2 \ + --num-epochs 20 \ --start-epoch 1 \ + --exp-dir pruned_transducer_stateless7/exp_giga_finetune \ + --subset S \ --use-fp16 1 \ - --exp-dir pruned_transducer_stateless7/exp \ - --full-libri 1 \ - --max-duration 550 - + --base-lr 0.005 \ + --lr-epochs 100 \ + --lr-batches 100000 \ + --bpe-model icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11/data/lang_bpe_500/bpe.model \ + --do-finetune True \ + --use-mux True \ + --finetune-ckpt icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11/exp/pretrained.pt \ + --max-duration 500 """ @@ -59,9 +57,10 @@ import torch.multiprocessing as mp import torch.nn as nn from decoder import Decoder +from asr_datamodule import LibriSpeechAsrDataModule from gigaspeech import GigaSpeechAsrDataModule from joiner import Joiner -from lhotse.cut import Cut +from lhotse.cut import Cut, CutSet from lhotse.dataset.sampling.base import CutSampler from lhotse.utils import fix_random_seed from model import Transducer @@ -103,7 +102,21 @@ def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None: def add_finetune_arguments(parser: argparse.ArgumentParser): - parser.add_argument("--do-finetune", type=str2bool, default=False) + parser.add_argument( + "--do-finetune", + type=str2bool, + default=True, + help="Whether to fine-tune.", + ) + parser.add_argument( + "--use-mux", + type=str2bool, + default=False, + help=""" + Whether to adapt. If true, we will mix 5% of the new data + with 95% of the original data to fine-tune. + """, + ) parser.add_argument( "--init-modules", @@ -907,7 +920,11 @@ def train_one_epoch( # NOTE: We use reduction==sum and loss is computed over utterances # in the batch and there is no normalization to it so far. scaler.scale(loss).backward() - set_batch_count(model, params.batch_idx_train) + # Skip the warmup by adding a huge number to batch_count + if params.do_finetune: + set_batch_count(model, params.batch_idx_train + 100000) + else: + set_batch_count(model, params.batch_idx_train) scheduler.step_batch(params.batch_idx_train) scaler.step(optimizer) @@ -1104,7 +1121,12 @@ def run(rank, world_size, args): parameters_names=parameters_names, ) - scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs) + scheduler = Eden( + optimizer=optimizer, + lr_batches=params.lr_batches, + lr_epochs=params.lr_epochs, + warmup_batches=0, + ) if checkpoints and "optimizer" in checkpoints: logging.info("Loading optimizer state dict") @@ -1129,7 +1151,15 @@ def run(rank, world_size, args): gigaspeech = GigaSpeechAsrDataModule(args) - train_cuts = gigaspeech.train_cuts() + if params.use_mux: + librispeech = LibriSpeechAsrDataModule(args) + train_cuts = CutSet.mux( + librispeech.train_all_shuf_cuts(), + gigaspeech.train_cuts(), + weights=[0.95, 0.05], + ) + else: + train_cuts = gigaspeech.train_cuts() def remove_short_and_long_utt(c: Cut): # Keep only utterances with duration between 1 second and 20 seconds @@ -1141,9 +1171,9 @@ def remove_short_and_long_utt(c: Cut): # an utterance duration distribution for your dataset to select # the threshold if c.duration < 1.0 or c.duration > 20.0: - logging.warning( - f"Exclude cut with ID {c.id} from training. Duration: {c.duration}" - ) + # logging.warning( + # f"Exclude cut with ID {c.id} from training. Duration: {c.duration}" + # ) return False # In pruned RNN-T, we require that T >= S