diff --git a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/train.py b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/train.py index 9088378fae..fa809b768a 100644 --- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/train.py +++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/train.py @@ -67,7 +67,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -638,7 +638,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -843,7 +843,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -912,7 +912,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell/ASR/pruned_transducer_stateless2/train.py b/egs/aishell/ASR/pruned_transducer_stateless2/train.py index dda098e993..60f014c48d 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless2/train.py +++ b/egs/aishell/ASR/pruned_transducer_stateless2/train.py @@ -60,7 +60,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -688,7 +688,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -888,7 +888,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -989,7 +989,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell/ASR/pruned_transducer_stateless3/model.py b/egs/aishell/ASR/pruned_transducer_stateless3/model.py index cafc9d1bb3..a4dda0d6d5 100644 --- a/egs/aishell/ASR/pruned_transducer_stateless3/model.py +++ b/egs/aishell/ASR/pruned_transducer_stateless3/model.py @@ -184,7 +184,7 @@ def forward( lm = simple_lm_proj(decoder_out) am = simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -219,7 +219,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). logits = joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/aishell/ASR/pruned_transducer_stateless3/train.py b/egs/aishell/ASR/pruned_transducer_stateless3/train.py index bf60c4fadd..7c23041cad 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless3/train.py +++ b/egs/aishell/ASR/pruned_transducer_stateless3/train.py @@ -79,7 +79,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -797,7 +797,7 @@ def train_one_epoch( aishell = is_aishell(batch["supervisions"]["cut"][0]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1096,7 +1096,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1202,7 +1202,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell/ASR/pruned_transducer_stateless7/do_not_use_it_directly.py b/egs/aishell/ASR/pruned_transducer_stateless7/do_not_use_it_directly.py index 9a9d92c20b..058d0ff6ba 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless7/do_not_use_it_directly.py +++ b/egs/aishell/ASR/pruned_transducer_stateless7/do_not_use_it_directly.py @@ -74,7 +74,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -812,7 +812,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1107,7 +1107,7 @@ def remove_short_and_long_utt(c: Cut): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1206,7 +1206,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell/ASR/pruned_transducer_stateless7/train.py b/egs/aishell/ASR/pruned_transducer_stateless7/train.py index ede2bd3e5a..2dc835f3b3 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless7/train.py +++ b/egs/aishell/ASR/pruned_transducer_stateless7/train.py @@ -70,7 +70,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -809,7 +809,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1107,7 +1107,7 @@ def remove_short_and_long_utt(c: Cut): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1206,7 +1206,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py b/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py index be48d6ddef..811269989e 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py 
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py @@ -64,7 +64,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -802,7 +802,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1102,7 +1102,7 @@ def tokenize_and_encode_text(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1202,7 +1202,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py index e3387e6706..6653d9d9cf 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py +++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py @@ -63,7 +63,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer_for_ncnn_export_only import Zipformer @@ -813,7 +813,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1105,7 +1105,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1205,7 +1205,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py index cba3122141..f3b0f1e113 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py +++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py @@ -63,7 +63,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from 
torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -812,7 +812,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1104,7 +1104,7 @@ def remove_short_and_long_utt(c: Cut): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1202,7 +1202,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell/ASR/whisper/train.py b/egs/aishell/ASR/whisper/train.py index e84dcf1565..d77f8c270a 100755 --- a/egs/aishell/ASR/whisper/train.py +++ b/egs/aishell/ASR/whisper/train.py @@ -62,7 +62,7 @@ from lhotse.utils import fix_random_seed from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.functional import pad as pad_tensor from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -514,7 +514,7 @@ def compute_validation_loss( tot_loss = MetricsTracker() for batch_idx, batch in enumerate(valid_dl): - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, tokenizer=tokenizer, @@ -608,7 +608,7 @@ def train_one_epoch( ) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, tokenizer=tokenizer, @@ -812,7 +812,7 @@ def run(rank, world_size, args): train_dl = aishell.train_dataloaders(aishell.train_cuts()) valid_dl = aishell.valid_dataloaders(aishell.valid_cuts()) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/aishell/ASR/zipformer/train.py b/egs/aishell/ASR/zipformer/train.py index ab568b20f6..cd253c5970 100755 --- a/egs/aishell/ASR/zipformer/train.py +++ b/egs/aishell/ASR/zipformer/train.py @@ -71,7 +71,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -910,7 +910,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1201,7 +1201,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = 
GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1302,7 +1302,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell/ASR/zipformer/train_bbpe.py b/egs/aishell/ASR/zipformer/train_bbpe.py index 2dac0cc64d..46a5506db8 100755 --- a/egs/aishell/ASR/zipformer/train_bbpe.py +++ b/egs/aishell/ASR/zipformer/train_bbpe.py @@ -61,7 +61,7 @@ from lhotse.utils import fix_random_seed from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from train import ( @@ -495,7 +495,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -795,7 +795,7 @@ def tokenize_and_encode_text(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -895,7 +895,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/aishell2/ASR/pruned_transducer_stateless5/train.py b/egs/aishell2/ASR/pruned_transducer_stateless5/train.py index 772d9e6bf1..8c7448d4c8 100755 --- a/egs/aishell2/ASR/pruned_transducer_stateless5/train.py +++ b/egs/aishell2/ASR/pruned_transducer_stateless5/train.py @@ -75,7 +75,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -734,7 +734,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -963,7 +963,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1062,7 +1062,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git 
a/egs/aishell4/ASR/pruned_transducer_stateless5/train.py b/egs/aishell4/ASR/pruned_transducer_stateless5/train.py index 0eb9271f59..a354f761e5 100755 --- a/egs/aishell4/ASR/pruned_transducer_stateless5/train.py +++ b/egs/aishell4/ASR/pruned_transducer_stateless5/train.py @@ -68,7 +68,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -727,7 +727,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) # print(batch["supervisions"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -963,7 +963,7 @@ def text_normalize_for_cut(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1034,7 +1034,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/alimeeting/ASR/pruned_transducer_stateless2/train.py b/egs/alimeeting/ASR/pruned_transducer_stateless2/train.py index 2b1b6f9b48..30154291df 100644 --- a/egs/alimeeting/ASR/pruned_transducer_stateless2/train.py +++ b/egs/alimeeting/ASR/pruned_transducer_stateless2/train.py @@ -67,7 +67,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -638,7 +638,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -843,7 +843,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -912,7 +912,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py index e321deeb10..30879d8d24 100755 --- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py +++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py @@ -55,7 +55,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -782,7 +782,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1031,7 +1031,7 @@ def run(rank, world_size, args): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1127,7 +1127,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/ami/ASR/pruned_transducer_stateless7/train.py b/egs/ami/ASR/pruned_transducer_stateless7/train.py index 97ebc5bcfd..d62cdadb75 100755 --- a/egs/ami/ASR/pruned_transducer_stateless7/train.py +++ b/egs/ami/ASR/pruned_transducer_stateless7/train.py @@ -55,7 +55,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -773,7 +773,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1034,7 +1034,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1134,7 +1134,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/ami/SURT/dprnn_zipformer/train.py b/egs/ami/SURT/dprnn_zipformer/train.py index 9e77c05278..adc6a84954 100755 --- a/egs/ami/SURT/dprnn_zipformer/train.py +++ b/egs/ami/SURT/dprnn_zipformer/train.py @@ -61,7 +61,7 @@ from optim import Eden, ScaledAdam from scaling import ScaledLinear, 
ScaledLSTM from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -1067,7 +1067,7 @@ def train_one_epoch( batch_size = batch["inputs"].shape[0] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1314,7 +1314,7 @@ def run(rank, world_size, args): ) valid_dl = ami.valid_dataloaders(dev_cuts) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/ami/SURT/dprnn_zipformer/train_adapt.py b/egs/ami/SURT/dprnn_zipformer/train_adapt.py index 0647a7c787..ac5b0dadc7 100755 --- a/egs/ami/SURT/dprnn_zipformer/train_adapt.py +++ b/egs/ami/SURT/dprnn_zipformer/train_adapt.py @@ -61,7 +61,7 @@ from optim import Eden, ScaledAdam from scaling import ScaledLinear, ScaledLSTM from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -1058,7 +1058,7 @@ def train_one_epoch( batch_size = batch["inputs"].shape[0] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1305,7 +1305,7 @@ def run(rank, world_size, args): ) valid_dl = ami.valid_dataloaders(dev_cuts) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/audioset/AT/zipformer/train.py b/egs/audioset/AT/zipformer/train.py index 9532ed906b..67c7033642 100644 --- a/egs/audioset/AT/zipformer/train.py +++ b/egs/audioset/AT/zipformer/train.py @@ -53,7 +53,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -799,7 +799,7 @@ def save_bad_model(suffix: str = ""): num_samples += batch_size try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1057,7 +1057,7 @@ def remove_short_and_long_utt(c: Cut): valid_cuts = audioset.audioset_eval_cuts() valid_dl = audioset.valid_dataloaders(valid_cuts) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1148,7 +1148,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): 
+ with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py b/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py index 486ab73dff..5e98084ec0 100755 --- a/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py +++ b/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py @@ -66,7 +66,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -825,7 +825,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1120,7 +1120,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1220,7 +1220,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py index fa241abe77..aefe88f3f5 100755 --- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py +++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py @@ -65,7 +65,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer_for_ncnn_export_only import Zipformer @@ -818,7 +818,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1109,7 +1109,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1209,7 +1209,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/finetune.py b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/finetune.py index 8905dc6172..976004ecaa 100755 --- 
a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/finetune.py +++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/finetune.py @@ -68,7 +68,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -895,7 +895,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1193,7 +1193,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1293,7 +1293,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py index 8260c49858..67e1a81334 100755 --- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py +++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py @@ -65,7 +65,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -840,7 +840,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1137,7 +1137,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1237,7 +1237,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/commonvoice/ASR/zipformer/train.py b/egs/commonvoice/ASR/zipformer/train.py index c0219df193..271014db0c 100755 --- a/egs/commonvoice/ASR/zipformer/train.py +++ b/egs/commonvoice/ASR/zipformer/train.py @@ -75,7 +75,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import 
Zipformer2 @@ -969,7 +969,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1265,7 +1265,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1365,7 +1365,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/commonvoice/ASR/zipformer/train_char.py b/egs/commonvoice/ASR/zipformer/train_char.py index 639e1067a5..0aa7856ccc 100755 --- a/egs/commonvoice/ASR/zipformer/train_char.py +++ b/egs/commonvoice/ASR/zipformer/train_char.py @@ -67,7 +67,7 @@ from lhotse.utils import fix_random_seed from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from train import ( @@ -604,7 +604,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -784,7 +784,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, @@ -979,7 +979,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/csj/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py b/egs/csj/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py index 661bfa6ca4..6d256308cb 100755 --- a/egs/csj/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py +++ b/egs/csj/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py @@ -67,7 +67,7 @@ from optim import Eden, ScaledAdam from tokenizer import Tokenizer from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer_for_ncnn_export_only import Zipformer @@ -839,7 +839,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1146,7 +1146,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = 
GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1246,7 +1246,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/csj/ASR/pruned_transducer_stateless7_streaming/train.py b/egs/csj/ASR/pruned_transducer_stateless7_streaming/train.py index 8f07fc42fc..ef7ea90138 100755 --- a/egs/csj/ASR/pruned_transducer_stateless7_streaming/train.py +++ b/egs/csj/ASR/pruned_transducer_stateless7_streaming/train.py @@ -67,7 +67,7 @@ from optim import Eden, ScaledAdam from tokenizer import Tokenizer from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -838,7 +838,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1145,7 +1145,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1245,7 +1245,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/gigaspeech/ASR/pruned_transducer_stateless2/train.py b/egs/gigaspeech/ASR/pruned_transducer_stateless2/train.py index e0e11fc70a..a7772b62f4 100755 --- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/train.py @@ -64,7 +64,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -675,7 +675,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -873,7 +873,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -944,7 +944,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. 
are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/gigaspeech/ASR/zipformer/train.py b/egs/gigaspeech/ASR/zipformer/train.py index 5092ef8cbd..4c122effee 100755 --- a/egs/gigaspeech/ASR/zipformer/train.py +++ b/egs/gigaspeech/ASR/zipformer/train.py @@ -75,7 +75,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -958,7 +958,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1217,7 +1217,7 @@ def remove_short_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1317,7 +1317,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/gigaspeech/KWS/zipformer/finetune.py b/egs/gigaspeech/KWS/zipformer/finetune.py index 49e8aef1a3..a7ba561276 100755 --- a/egs/gigaspeech/KWS/zipformer/finetune.py +++ b/egs/gigaspeech/KWS/zipformer/finetune.py @@ -73,7 +73,7 @@ from model import AsrModel from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from train import ( @@ -291,7 +291,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -570,7 +570,7 @@ def remove_short_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/gigaspeech/KWS/zipformer/train.py b/egs/gigaspeech/KWS/zipformer/train.py index f2283cb036..39d8fc6cd9 100755 --- a/egs/gigaspeech/KWS/zipformer/train.py +++ b/egs/gigaspeech/KWS/zipformer/train.py @@ -75,7 +75,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -961,7 +961,7 @@ def 
save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1220,7 +1220,7 @@ def remove_short_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1320,7 +1320,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py index 30d9f0e515..bf50bf5ea0 100755 --- a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py +++ b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py @@ -61,7 +61,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -805,7 +805,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1096,7 +1096,7 @@ def remove_short_and_long_utt(c: Cut): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1196,7 +1196,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/ksponspeech/ASR/zipformer/train.py b/egs/ksponspeech/ASR/zipformer/train.py index 5f6ee7ccaa..485ea69c96 100755 --- a/egs/ksponspeech/ASR/zipformer/train.py +++ b/egs/ksponspeech/ASR/zipformer/train.py @@ -70,7 +70,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -942,7 +942,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1233,7 +1233,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = 
GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1333,7 +1333,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/libricss/SURT/dprnn_zipformer/model.py b/egs/libricss/SURT/dprnn_zipformer/model.py index 0e88357d19..688e1e78d2 100644 --- a/egs/libricss/SURT/dprnn_zipformer/model.py +++ b/egs/libricss/SURT/dprnn_zipformer/model.py @@ -140,7 +140,7 @@ def forward_helper( lm = self.simple_lm_proj(decoder_out) am = self.simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -175,7 +175,7 @@ def forward_helper( # prior to do_rnnt_pruning (this is an optimization for speed). logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/libricss/SURT/dprnn_zipformer/scaling.py b/egs/libricss/SURT/dprnn_zipformer/scaling.py index d46cb224e0..4040a7b89a 100644 --- a/egs/libricss/SURT/dprnn_zipformer/scaling.py +++ b/egs/libricss/SURT/dprnn_zipformer/scaling.py @@ -287,7 +287,7 @@ def forward(ctx, x: Tensor, dim: int): @staticmethod def backward(ctx, ans_grad: Tensor): (ans,) = ctx.saved_tensors - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): ans_grad = ans_grad.to(torch.float32) ans = ans.to(torch.float32) x_grad = ans_grad * ans @@ -1065,7 +1065,7 @@ def forward( def backward(ctx, x_grad: Tensor): (x_orig,) = ctx.saved_tensors with torch.enable_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): x_detached = x_orig.to(torch.float32).detach() x_detached.requires_grad = True @@ -1263,7 +1263,7 @@ def forward(self, x: Tensor) -> Tensor: ): return _no_op(x) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): eps = 1.0e-20 orig_x = x x = x.to(torch.float32) diff --git a/egs/libricss/SURT/dprnn_zipformer/train.py b/egs/libricss/SURT/dprnn_zipformer/train.py index 33ea7c5a66..148cafd4b7 100755 --- a/egs/libricss/SURT/dprnn_zipformer/train.py +++ b/egs/libricss/SURT/dprnn_zipformer/train.py @@ -69,7 +69,7 @@ from optim import Eden, ScaledAdam from scaling import ScaledLSTM from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -1096,7 +1096,7 @@ def train_one_epoch( batch_size = batch["inputs"].shape[0] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1343,7 +1343,7 @@ def run(rank, world_size, args): train_dl_ov40 = libricss.train_dataloaders(train_cuts_ov40) valid_dl = libricss.valid_dataloaders(dev_cuts) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = 
GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/libricss/SURT/dprnn_zipformer/train_adapt.py b/egs/libricss/SURT/dprnn_zipformer/train_adapt.py index 82b61baa01..8c37430ec8 100755 --- a/egs/libricss/SURT/dprnn_zipformer/train_adapt.py +++ b/egs/libricss/SURT/dprnn_zipformer/train_adapt.py @@ -67,7 +67,7 @@ from optim import Eden, ScaledAdam from scaling import ScaledLinear, ScaledLSTM from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -985,7 +985,7 @@ def train_one_epoch( batch_size = batch["inputs"].shape[0] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1237,7 +1237,7 @@ def run(rank, world_size, args): ) valid_dl = libricss.valid_dataloaders(dev_cuts) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/libriheavy/ASR/zipformer/train.py b/egs/libriheavy/ASR/zipformer/train.py index 524273ec5f..357e8a827f 100644 --- a/egs/libriheavy/ASR/zipformer/train.py +++ b/egs/libriheavy/ASR/zipformer/train.py @@ -78,7 +78,7 @@ from subsampling import Conv2dSubsampling from text_normalization import remove_punc_to_upper from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -958,7 +958,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1268,7 +1268,7 @@ def remove_short_and_long_utt(c: Cut): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1367,7 +1367,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/libriheavy/ASR/zipformer_prompt_asr/model_baseline.py b/egs/libriheavy/ASR/zipformer_prompt_asr/model_baseline.py index 66328bb892..77b4057c47 100644 --- a/egs/libriheavy/ASR/zipformer_prompt_asr/model_baseline.py +++ b/egs/libriheavy/ASR/zipformer_prompt_asr/model_baseline.py @@ -186,7 +186,7 @@ def forward( lm = self.simple_lm_proj(decoder_out) am = self.simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), 
am=am.float(), @@ -221,7 +221,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/libriheavy/ASR/zipformer_prompt_asr/model_with_BERT.py b/egs/libriheavy/ASR/zipformer_prompt_asr/model_with_BERT.py index 80fbf09f03..21c7b4facd 100644 --- a/egs/libriheavy/ASR/zipformer_prompt_asr/model_with_BERT.py +++ b/egs/libriheavy/ASR/zipformer_prompt_asr/model_with_BERT.py @@ -245,7 +245,7 @@ def forward( lm = self.simple_lm_proj(decoder_out) am = self.simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -287,7 +287,7 @@ def forward( logits = self.joiner(am_pruned, lm_pruned, context=context, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/libriheavy/ASR/zipformer_prompt_asr/scaling.py b/egs/libriheavy/ASR/zipformer_prompt_asr/scaling.py index a260d828e4..0e6764ba05 100644 --- a/egs/libriheavy/ASR/zipformer_prompt_asr/scaling.py +++ b/egs/libriheavy/ASR/zipformer_prompt_asr/scaling.py @@ -271,7 +271,7 @@ def forward(ctx, x: Tensor, dim: int): @staticmethod def backward(ctx, ans_grad: Tensor): (ans,) = ctx.saved_tensors - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): ans_grad = ans_grad.to(torch.float32) ans = ans.to(torch.float32) x_grad = ans_grad * ans @@ -685,7 +685,7 @@ def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None, None, None] try: with torch.enable_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): x = x.to(torch.float32) x = x.detach() x.requires_grad = True @@ -940,7 +940,7 @@ def backward(ctx, x_grad: Tensor): try: with torch.enable_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): x_detached = x_orig.to(torch.float32).detach() x_detached.requires_grad = True @@ -1280,7 +1280,7 @@ def forward(ctx, x: Tensor) -> Tensor: coeff = -0.08 - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): with torch.enable_grad(): x = x.detach() x.requires_grad = True @@ -1351,7 +1351,7 @@ def forward(ctx, x: Tensor) -> Tensor: zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): with torch.enable_grad(): x = x.detach() x.requires_grad = True diff --git a/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py b/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py index bfca5a0db6..93f7e12484 100644 --- a/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py +++ b/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py @@ -89,7 +89,7 @@ from subsampling import Conv2dSubsampling from text_normalization import train_text_normalization, upper_only_alpha from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import 
Zipformer2 @@ -975,7 +975,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1271,7 +1271,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1371,7 +1371,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/libriheavy/ASR/zipformer_prompt_asr/train_bert_encoder.py b/egs/libriheavy/ASR/zipformer_prompt_asr/train_bert_encoder.py index 36c6d6464a..2a2c206aaa 100755 --- a/egs/libriheavy/ASR/zipformer_prompt_asr/train_bert_encoder.py +++ b/egs/libriheavy/ASR/zipformer_prompt_asr/train_bert_encoder.py @@ -103,7 +103,7 @@ upper_only_alpha, ) from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -1321,7 +1321,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1647,7 +1647,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1749,7 +1749,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/libriheavy/ASR/zipformer_prompt_asr/zipformer.py b/egs/libriheavy/ASR/zipformer_prompt_asr/zipformer.py index 405c95acc2..d1cf90ffbb 100644 --- a/egs/libriheavy/ASR/zipformer_prompt_asr/zipformer.py +++ b/egs/libriheavy/ASR/zipformer_prompt_asr/zipformer.py @@ -1561,7 +1561,7 @@ def _print_attn_entropy(self, attn_weights: Tensor): (num_heads, batch_size, seq_len, seq_len) = attn_weights.shape with torch.no_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): attn_weights = attn_weights.to(torch.float32) attn_weights_entropy = ( -((attn_weights + 1.0e-20).log() * attn_weights) @@ -1844,7 +1844,7 @@ def _print_attn_entropy(self, attn_weights: Tensor): (num_heads, batch_size, seq_len, seq_len) = attn_weights.shape with torch.no_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): attn_weights = attn_weights.to(torch.float32) attn_weights_entropy = ( -((attn_weights + 1.0e-20).log() * attn_weights) diff --git 
a/egs/librilight/SSL/zipformer/finetune.py b/egs/librilight/SSL/zipformer/finetune.py index 568096c6ac..50dbd5f2d7 100644 --- a/egs/librilight/SSL/zipformer/finetune.py +++ b/egs/librilight/SSL/zipformer/finetune.py @@ -67,7 +67,7 @@ from model import AsrModel from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -1116,7 +1116,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1407,7 +1407,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1505,7 +1505,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librilight/SSL/zipformer/pretrain.py b/egs/librilight/SSL/zipformer/pretrain.py index 019f77ea33..5728dbe752 100644 --- a/egs/librilight/SSL/zipformer/pretrain.py +++ b/egs/librilight/SSL/zipformer/pretrain.py @@ -57,7 +57,7 @@ from optim import Eden, ScaledAdam from ssl_datamodule import LibriLightDataModule from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -936,7 +936,7 @@ def save_bad_model(suffix: str = ""): batch_size = batch["kmeans"].shape[0] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1229,7 +1229,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1320,7 +1320,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/conformer_ctc2/train.py b/egs/librispeech/ASR/conformer_ctc2/train.py index b0b5da1c0c..c4a13b1017 100755 --- a/egs/librispeech/ASR/conformer_ctc2/train.py +++ b/egs/librispeech/ASR/conformer_ctc2/train.py @@ -65,7 +65,7 @@ from lhotse.utils import fix_random_seed from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -676,7 +676,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = 
len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -965,7 +965,7 @@ def remove_invalid_utt_ctc(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1036,7 +1036,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/conformer_ctc3/train.py b/egs/librispeech/ASR/conformer_ctc3/train.py index 7e819a2d87..a2f1125ca7 100755 --- a/egs/librispeech/ASR/conformer_ctc3/train.py +++ b/egs/librispeech/ASR/conformer_ctc3/train.py @@ -76,7 +76,7 @@ from model import CTCModel from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -743,7 +743,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1004,7 +1004,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1073,7 +1073,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py index 130a7c97f1..ca21bd6bf8 100755 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py @@ -80,7 +80,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -772,7 +772,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1002,7 +1002,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = 
GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1071,7 +1071,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/do_not_use_it_directly.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/do_not_use_it_directly.py index 16ae4e4e24..d614f0914e 100755 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/do_not_use_it_directly.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/do_not_use_it_directly.py @@ -80,7 +80,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -774,7 +774,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1003,7 +1003,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1074,7 +1074,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py index 28d094a768..23ddb6bec8 100755 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py @@ -80,7 +80,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -772,7 +772,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1001,7 +1001,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1072,7 +1072,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/model.py b/egs/librispeech/ASR/lstm_transducer_stateless/model.py index 1ec9a8fc65..e7bad7ed83 100644 --- a/egs/librispeech/ASR/lstm_transducer_stateless/model.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless/model.py @@ -156,7 +156,7 @@ def forward( lm = self.simple_lm_proj(decoder_out) am = self.simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -192,7 +192,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). 
logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/lstm_transducer_stateless/train.py b/egs/librispeech/ASR/lstm_transducer_stateless/train.py index 1e50ce0909..feb81d5001 100755 --- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py @@ -66,7 +66,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -763,7 +763,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1023,7 +1023,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1092,7 +1092,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/lstm_transducer_stateless2/model.py b/egs/librispeech/ASR/lstm_transducer_stateless2/model.py index a758c550d0..4957d14b11 100644 --- a/egs/librispeech/ASR/lstm_transducer_stateless2/model.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless2/model.py @@ -195,7 +195,7 @@ def forward( lm = simple_lm_proj(decoder_out) am = simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -231,7 +231,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). 
logits = joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/lstm_transducer_stateless2/train.py b/egs/librispeech/ASR/lstm_transducer_stateless2/train.py index 4d4f3e1325..4fc4fa7f83 100755 --- a/egs/librispeech/ASR/lstm_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless2/train.py @@ -74,7 +74,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -848,7 +848,7 @@ def train_one_epoch( libri = is_libri(batch["supervisions"]["cut"][0]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1176,7 +1176,7 @@ def run(rank, world_size, args): else: logging.info("Skip scan_pessimistic_batches_for_oom") - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1247,7 +1247,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/lstm_transducer_stateless3/train.py b/egs/librispeech/ASR/lstm_transducer_stateless3/train.py index ae4cd1c6a2..2c1cef3a34 100755 --- a/egs/librispeech/ASR/lstm_transducer_stateless3/train.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless3/train.py @@ -66,7 +66,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -793,7 +793,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1067,7 +1067,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1136,7 +1136,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned2_knowledge/model.py b/egs/librispeech/ASR/pruned2_knowledge/model.py index 2ffea06e7a..ca8c28af17 100644 --- a/egs/librispeech/ASR/pruned2_knowledge/model.py +++ b/egs/librispeech/ASR/pruned2_knowledge/model.py @@ -141,7 +141,7 @@ def forward( lm = 
self.simple_lm_proj(decoder_out) am = self.simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -176,7 +176,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/pruned2_knowledge/sampling.py b/egs/librispeech/ASR/pruned2_knowledge/sampling.py index 3d2fdd6d8f..5b595c76c9 100644 --- a/egs/librispeech/ASR/pruned2_knowledge/sampling.py +++ b/egs/librispeech/ASR/pruned2_knowledge/sampling.py @@ -10,7 +10,7 @@ import torch from scaling import ScaledLinear from torch import Tensor, nn -from torch.amp import GradScaler, custom_bwd, custom_fwd +from torch.cuda.amp import GradScaler, custom_bwd, custom_fwd from torch_scheduled_sampling import sample_combined # The main exports of this file are the module KnowledgeBaseLookup and the @@ -330,14 +330,14 @@ def _test_knowledge_base_lookup_autocast(): optimizer = Eve(m.parameters(), lr=0.005, eps=1.0e-04) m = m.to(device) - scaler = GradScaler("cuda", enabled=True) + scaler = GradScaler(enabled=True) start = timeit.default_timer() for epoch in range(150): for n, (x, y) in enumerate(train_pairs): y_out = m(x) - with torch.amp.autocast("cuda", enabled=True): + with torch.cuda.amp.autocast(enabled=True): loss = ((y_out - y) ** 2).mean() * 100.0 if n % 10 == 0 and epoch % 10 == 0: print(f"Epoch {epoch}, batch {n}, loss {loss.item()}") diff --git a/egs/librispeech/ASR/pruned2_knowledge/train.py b/egs/librispeech/ASR/pruned2_knowledge/train.py index 8c117dd605..931341cc44 100755 --- a/egs/librispeech/ASR/pruned2_knowledge/train.py +++ b/egs/librispeech/ASR/pruned2_knowledge/train.py @@ -66,7 +66,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -650,7 +650,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -868,7 +868,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -937,7 +937,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py b/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py index b25a84a6b3..2b872f1d50 100755 --- a/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py +++ b/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py @@ -55,7 +55,7 @@ from model import Transducer from noam import Noam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -693,7 +693,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -939,7 +939,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1004,7 +1004,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/model.py b/egs/librispeech/ASR/pruned_transducer_stateless2/model.py index 59ed8310cf..272d06c37d 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/model.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/model.py @@ -157,7 +157,7 @@ def forward( lm = self.simple_lm_proj(decoder_out) am = self.simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -193,7 +193,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). 
logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index e86ec80520..6c19f2cb0c 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -78,7 +78,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -759,7 +759,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1000,7 +1000,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 0 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1067,7 +1067,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/model.py b/egs/librispeech/ASR/pruned_transducer_stateless3/model.py index 0495c8a292..d45f6dadc9 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/model.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/model.py @@ -195,7 +195,7 @@ def forward( lm = simple_lm_proj(decoder_out) am = simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -231,7 +231,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). 
logits = joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py index 8ef207518d..fdafa5a873 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py @@ -74,7 +74,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -827,7 +827,7 @@ def train_one_epoch( libri = is_libri(batch["supervisions"]["cut"][0]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1126,7 +1126,7 @@ def run(rank, world_size, args): warmup=0.0 if params.start_epoch == 0 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1195,7 +1195,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py index b6682908be..875b03f7f7 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py @@ -80,7 +80,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -789,7 +789,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1047,7 +1047,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1116,7 +1116,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py index 2b559a27c0..66dc5f991f 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py +++ 
b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py @@ -68,7 +68,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -814,7 +814,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1078,7 +1078,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1147,7 +1147,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/model.py b/egs/librispeech/ASR/pruned_transducer_stateless6/model.py index 20b730a08d..daadb70c98 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless6/model.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless6/model.py @@ -185,7 +185,7 @@ def forward( lm = self.simple_lm_proj(decoder_out) am = self.simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -220,7 +220,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). 
logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py index 93663505a0..8f033cb9ad 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py @@ -80,7 +80,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -781,7 +781,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1039,7 +1039,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1108,7 +1108,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py b/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py index d29010a239..e7546ec453 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py @@ -66,7 +66,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -903,7 +903,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1219,7 +1219,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1319,7 +1319,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/model.py b/egs/librispeech/ASR/pruned_transducer_stateless7/model.py index 49076b96f3..add0e6a18e 100644 --- 
a/egs/librispeech/ASR/pruned_transducer_stateless7/model.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/model.py @@ -150,7 +150,7 @@ def forward( # if self.training and random.random() < 0.25: # am = penalize_abs_values_gt(am, 30.0, 1.0e-04) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -185,7 +185,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py index 16d86fe2d7..30a7370619 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py @@ -289,7 +289,7 @@ def forward(ctx, x: Tensor, dim: int): @staticmethod def backward(ctx, ans_grad: Tensor): (ans,) = ctx.saved_tensors - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): ans_grad = ans_grad.to(torch.float32) ans = ans.to(torch.float32) x_grad = ans_grad * ans @@ -669,7 +669,7 @@ def forward( def backward(ctx, x_grad: Tensor): (x_orig,) = ctx.saved_tensors with torch.enable_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): x_detached = x_orig.to(torch.float32).detach() x_detached.requires_grad = True @@ -867,7 +867,7 @@ def forward(self, x: Tensor) -> Tensor: ): return _no_op(x) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): eps = 1.0e-20 orig_x = x x = x.to(torch.float32) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py index 91fccd58d3..436ec53b41 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py @@ -67,7 +67,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -809,7 +809,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1106,7 +1106,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1206,7 +1206,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py 
b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py index ebef7e9776..cbde2a2e4d 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py @@ -1421,7 +1421,7 @@ def _print_attn_stats(self, attn_weights: Tensor, attn_output: Tensor): bsz = n // num_heads with torch.no_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): attn_weights = attn_weights.to(torch.float32) attn_output = attn_output.to(torch.float32) attn_weights_entropy = ( diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/model.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/model.py index 0224c15d7e..a6e919e2fb 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/model.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/model.py @@ -150,7 +150,7 @@ def forward( lm = self.simple_lm_proj(decoder_out) am = self.simple_am_proj(encoder_out) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -185,7 +185,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/train.py index 395b07b05d..b35e56abc9 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/train.py @@ -67,7 +67,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -833,7 +833,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1128,7 +1128,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1228,7 +1228,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/model.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/model.py index 4675697c14..0582b289fc 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/model.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/model.py @@ -178,7 +178,7 @@ def forward( am = self.simple_am_proj(encoder_out_fr) lm = self.simple_lm_proj(decoder_out) - with 
torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -213,7 +213,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/train.py index a431b278d2..c2d877a93f 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/train.py @@ -63,7 +63,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -822,7 +822,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1118,7 +1118,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1217,7 +1217,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py index dc34934250..8e239e322e 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py @@ -66,7 +66,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer_for_ncnn_export_only import Zipformer @@ -811,7 +811,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1106,7 +1106,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1206,7 +1206,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in 
batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/train.py index a8f47d9412..8bd00bbefc 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/train.py @@ -66,7 +66,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -810,7 +810,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1124,7 +1124,7 @@ def remove_short_and_long_utt(c: Cut): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1224,7 +1224,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer.py index e3b8b37259..c7e45564fd 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer.py @@ -2408,7 +2408,7 @@ def _print_attn_stats(self, attn_weights: Tensor, attn_output: Tensor): bsz = n // num_heads with torch.no_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): attn_weights = attn_weights.to(torch.float32) attn_output = attn_output.to(torch.float32) attn_weights_entropy = ( diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer_for_ncnn_export_only.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer_for_ncnn_export_only.py index ff23725b7e..5284ed6270 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer_for_ncnn_export_only.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer_for_ncnn_export_only.py @@ -2708,7 +2708,7 @@ def _print_attn_stats(self, attn_weights: Tensor, attn_output: Tensor): bsz = n // num_heads with torch.no_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): attn_weights = attn_weights.to(torch.float32) attn_output = attn_output.to(torch.float32) attn_weights_entropy = ( diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming_multi/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming_multi/train.py index 4c8c239a10..da5e144c93 100755 --- 
a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming_multi/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming_multi/train.py @@ -70,7 +70,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -866,7 +866,7 @@ def train_one_epoch( libri = is_libri(batch["supervisions"]["cut"][0]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1218,7 +1218,7 @@ def run(rank, world_size, args): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1320,7 +1320,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless8/model.py b/egs/librispeech/ASR/pruned_transducer_stateless8/model.py index c0b9113b7a..39a3607968 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless8/model.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless8/model.py @@ -172,7 +172,7 @@ def forward( # if self.training and random.random() < 0.25: # am = penalize_abs_values_gt(am, 30.0, 1.0e-04) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -207,7 +207,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). 
logits = joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/pruned_transducer_stateless8/train.py b/egs/librispeech/ASR/pruned_transducer_stateless8/train.py index 0ccef210ec..646f30ca16 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless8/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless8/train.py @@ -75,7 +75,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -866,7 +866,7 @@ def train_one_epoch( libri = is_libri(batch["supervisions"]["cut"][0]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1219,7 +1219,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1321,7 +1321,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/tiny_transducer_ctc/train.py b/egs/librispeech/ASR/tiny_transducer_ctc/train.py index 0536e89b30..1bfd071de1 100644 --- a/egs/librispeech/ASR/tiny_transducer_ctc/train.py +++ b/egs/librispeech/ASR/tiny_transducer_ctc/train.py @@ -51,7 +51,7 @@ from lhotse.utils import fix_random_seed from model import Transducer from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import AdamW from torch.optim.lr_scheduler import StepLR @@ -809,7 +809,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1092,7 +1092,7 @@ def remove_short_and_long_utt(c: Cut): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1198,7 +1198,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/zipformer/finetune.py b/egs/librispeech/ASR/zipformer/finetune.py index 5da903d382..2ff6319140 100755 --- a/egs/librispeech/ASR/zipformer/finetune.py +++ b/egs/librispeech/ASR/zipformer/finetune.py @@ -78,7 +78,7 @@ from 
scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -1049,7 +1049,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1373,7 +1373,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1474,7 +1474,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/zipformer/model.py b/egs/librispeech/ASR/zipformer/model.py index b0bb7c7fe4..c7dbe1e0ad 100644 --- a/egs/librispeech/ASR/zipformer/model.py +++ b/egs/librispeech/ASR/zipformer/model.py @@ -285,7 +285,7 @@ def forward_transducer( # if self.training and random.random() < 0.25: # am = penalize_abs_values_gt(am, 30.0, 1.0e-04) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -320,7 +320,7 @@ def forward_transducer( # prior to do_rnnt_pruning (this is an optimization for speed). 
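All of the train.py hunks in this patch converge on the same torch.cuda.amp training-step shape: forward and loss under autocast, backward through a GradScaler, then scaler.step and scaler.update. The sketch below compresses that loop; compute_loss, params.use_fp16 and init_scale=1.0 come from the hunks, while model, optimizer and batch are generic stand-ins.

import torch
from torch.cuda.amp import GradScaler

scaler = GradScaler(enabled=True, init_scale=1.0)

def train_step(model, optimizer, batch, use_fp16=True):
    # Forward + loss in mixed precision; autocast is a no-op when enabled=False.
    with torch.cuda.amp.autocast(enabled=use_fp16):
        loss = model(batch)           # stand-in for compute_loss(...)
    optimizer.zero_grad()
    scaler.scale(loss).backward()     # scale so fp16 gradients do not underflow
    scaler.step(optimizer)            # unscales, and skips the step on inf/nan
    scaler.update()                   # adjusts the loss scale for the next step
    return loss.detach()

Some recipes in this patch (e.g. egs/librispeech/ASR/zipformer and egs/libritts/ASR/zipformer) additionally pass an explicit dtype to autocast via params.dtype; the loop structure is otherwise identical.
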
logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/ASR/zipformer/scaling.py b/egs/librispeech/ASR/zipformer/scaling.py index 46df86bf73..d345c29316 100644 --- a/egs/librispeech/ASR/zipformer/scaling.py +++ b/egs/librispeech/ASR/zipformer/scaling.py @@ -306,7 +306,7 @@ def forward(ctx, x: Tensor, dim: int): @staticmethod def backward(ctx, ans_grad: Tensor): (ans,) = ctx.saved_tensors - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): ans_grad = ans_grad.to(torch.float32) ans = ans.to(torch.float32) x_grad = ans_grad * ans @@ -759,7 +759,7 @@ def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None, None, None] try: with torch.enable_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): x = x.to(torch.float32) x = x.detach() x.requires_grad = True @@ -1014,7 +1014,7 @@ def backward(ctx, x_grad: Tensor): try: with torch.enable_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): x_detached = x_orig.to(torch.float32).detach() x_detached.requires_grad = True @@ -1353,7 +1353,7 @@ def forward(ctx, x: Tensor) -> Tensor: coeff = -0.08 - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): with torch.enable_grad(): x = x.detach() x.requires_grad = True @@ -1430,7 +1430,7 @@ def forward(ctx, x: Tensor) -> Tensor: zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): with torch.enable_grad(): x = x.detach() x.requires_grad = True diff --git a/egs/librispeech/ASR/zipformer/train.py b/egs/librispeech/ASR/zipformer/train.py index 71d045ea09..c074c32ec7 100755 --- a/egs/librispeech/ASR/zipformer/train.py +++ b/egs/librispeech/ASR/zipformer/train.py @@ -79,7 +79,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -1101,7 +1101,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", + with torch.cuda.amp.autocast( enabled=params.use_autocast, dtype=params.dtype ): loss, loss_info = compute_loss( @@ -1438,7 +1438,7 @@ def remove_short_and_long_utt(c: Cut): spec_augment=spec_augment, ) - scaler = GradScaler("cuda", enabled=params.use_autocast, init_scale=1.0) + scaler = GradScaler(enabled=params.use_autocast, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1540,7 +1540,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", + with torch.cuda.amp.autocast( enabled=params.use_autocast, dtype=params.dtype ): loss, _ = compute_loss( diff --git a/egs/librispeech/ASR/zipformer/zipformer.py b/egs/librispeech/ASR/zipformer/zipformer.py index bdfd2175c2..2a0ae01297 100644 --- a/egs/librispeech/ASR/zipformer/zipformer.py +++ 
b/egs/librispeech/ASR/zipformer/zipformer.py @@ -1873,7 +1873,7 @@ def _print_attn_entropy(self, attn_weights: Tensor): (num_heads, batch_size, seq_len, seq_len) = attn_weights.shape with torch.no_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): attn_weights = attn_weights.to(torch.float32) attn_weights_entropy = ( -((attn_weights + 1.0e-20).log() * attn_weights) diff --git a/egs/librispeech/ASR/zipformer_adapter/train.py b/egs/librispeech/ASR/zipformer_adapter/train.py index 0207fc26e1..3511590da8 100755 --- a/egs/librispeech/ASR/zipformer_adapter/train.py +++ b/egs/librispeech/ASR/zipformer_adapter/train.py @@ -67,7 +67,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -1052,7 +1052,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1397,7 +1397,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1498,7 +1498,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/zipformer_adapter/zipformer.py b/egs/librispeech/ASR/zipformer_adapter/zipformer.py index 6224d136a6..8e2dfdd72c 100644 --- a/egs/librispeech/ASR/zipformer_adapter/zipformer.py +++ b/egs/librispeech/ASR/zipformer_adapter/zipformer.py @@ -1916,7 +1916,7 @@ def _print_attn_entropy(self, attn_weights: Tensor): (num_heads, batch_size, seq_len, seq_len) = attn_weights.shape with torch.no_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): attn_weights = attn_weights.to(torch.float32) attn_weights_entropy = ( -((attn_weights + 1.0e-20).log() * attn_weights) diff --git a/egs/librispeech/ASR/zipformer_ctc/train.py b/egs/librispeech/ASR/zipformer_ctc/train.py index dfe702d2f7..60112a84e7 100755 --- a/egs/librispeech/ASR/zipformer_ctc/train.py +++ b/egs/librispeech/ASR/zipformer_ctc/train.py @@ -46,7 +46,7 @@ from model import CTCModel from optim import Eden, LRScheduler, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.utils import clip_grad_norm_ from torch.utils.tensorboard import SummaryWriter @@ -726,7 +726,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -987,7 +987,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = 
GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/librispeech/ASR/zipformer_lora/finetune.py b/egs/librispeech/ASR/zipformer_lora/finetune.py index 53152971d8..3f36f229f9 100755 --- a/egs/librispeech/ASR/zipformer_lora/finetune.py +++ b/egs/librispeech/ASR/zipformer_lora/finetune.py @@ -78,7 +78,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -1065,7 +1065,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1406,7 +1406,7 @@ def remove_short_and_long_utt(c: Cut): # params=params, # ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1507,7 +1507,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/zipformer_lora/scaling.py b/egs/librispeech/ASR/zipformer_lora/scaling.py index a1e77fe0e9..8d7aa80275 100644 --- a/egs/librispeech/ASR/zipformer_lora/scaling.py +++ b/egs/librispeech/ASR/zipformer_lora/scaling.py @@ -307,7 +307,7 @@ def forward(ctx, x: Tensor, dim: int): @staticmethod def backward(ctx, ans_grad: Tensor): (ans,) = ctx.saved_tensors - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): ans_grad = ans_grad.to(torch.float32) ans = ans.to(torch.float32) x_grad = ans_grad * ans @@ -863,7 +863,7 @@ def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None, None, None] try: with torch.enable_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): x = x.to(torch.float32) x = x.detach() x.requires_grad = True @@ -1118,7 +1118,7 @@ def backward(ctx, x_grad: Tensor): try: with torch.enable_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): x_detached = x_orig.to(torch.float32).detach() x_detached.requires_grad = True @@ -1457,7 +1457,7 @@ def forward(ctx, x: Tensor) -> Tensor: coeff = -0.08 - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): with torch.enable_grad(): x = x.detach() x.requires_grad = True @@ -1534,7 +1534,7 @@ def forward(ctx, x: Tensor) -> Tensor: zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): with torch.enable_grad(): x = x.detach() x.requires_grad = True diff --git a/egs/librispeech/ASR/zipformer_lora/train.py 
b/egs/librispeech/ASR/zipformer_lora/train.py index 592bc0fd47..9ab214e86e 100755 --- a/egs/librispeech/ASR/zipformer_lora/train.py +++ b/egs/librispeech/ASR/zipformer_lora/train.py @@ -76,7 +76,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -947,7 +947,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1252,7 +1252,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1352,7 +1352,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/ASR/zipformer_lora/zipformer.py b/egs/librispeech/ASR/zipformer_lora/zipformer.py index ece7c3df1a..43865609ad 100644 --- a/egs/librispeech/ASR/zipformer_lora/zipformer.py +++ b/egs/librispeech/ASR/zipformer_lora/zipformer.py @@ -1905,7 +1905,7 @@ def _print_attn_entropy(self, attn_weights: Tensor): (num_heads, batch_size, seq_len, seq_len) = attn_weights.shape with torch.no_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): attn_weights = attn_weights.to(torch.float32) attn_weights_entropy = ( -((attn_weights + 1.0e-20).log() * attn_weights) diff --git a/egs/librispeech/ASR/zipformer_mmi/train.py b/egs/librispeech/ASR/zipformer_mmi/train.py index bed3cfa04e..c1785a3282 100755 --- a/egs/librispeech/ASR/zipformer_mmi/train.py +++ b/egs/librispeech/ASR/zipformer_mmi/train.py @@ -64,7 +64,7 @@ from model import CTCModel from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -744,7 +744,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1037,7 +1037,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1138,7 +1138,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = 
compute_loss( params=params, model=model, diff --git a/egs/librispeech/SSL/hubert/finetune.py b/egs/librispeech/SSL/hubert/finetune.py index 9717d579d4..17daa3c9d3 100644 --- a/egs/librispeech/SSL/hubert/finetune.py +++ b/egs/librispeech/SSL/hubert/finetune.py @@ -66,7 +66,7 @@ from model import AsrModel from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -816,7 +816,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1109,7 +1109,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1207,7 +1207,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/SSL/hubert/finetune_ce.py b/egs/librispeech/SSL/hubert/finetune_ce.py index 340aa4aa2f..2723cc770e 100644 --- a/egs/librispeech/SSL/hubert/finetune_ce.py +++ b/egs/librispeech/SSL/hubert/finetune_ce.py @@ -66,7 +66,7 @@ from model import AsrModel from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -816,7 +816,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1109,7 +1109,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1207,7 +1207,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/SSL/hubert/model.py b/egs/librispeech/SSL/hubert/model.py index b23fa32ea1..46a968b69e 100644 --- a/egs/librispeech/SSL/hubert/model.py +++ b/egs/librispeech/SSL/hubert/model.py @@ -221,7 +221,7 @@ def forward_transducer( # if self.training and random.random() < 0.25: # am = penalize_abs_values_gt(am, 30.0, 1.0e-04) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -256,7 +256,7 @@ def forward_transducer( # 
prior to do_rnnt_pruning (this is an optimization for speed). logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/SSL/hubert/pretrain.py b/egs/librispeech/SSL/hubert/pretrain.py index 1868bf0a69..f183d90fd2 100644 --- a/egs/librispeech/SSL/hubert/pretrain.py +++ b/egs/librispeech/SSL/hubert/pretrain.py @@ -59,7 +59,7 @@ from optim import Eden, ScaledAdam from ssl_datamodule import LibriSpeechDataModule from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.functional import pad from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -644,7 +644,7 @@ def save_bad_model(suffix: str = ""): batch_size = batch["kmeans"].shape[0] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -945,7 +945,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1036,7 +1036,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/SSL/hubert/pretrain_ce.py b/egs/librispeech/SSL/hubert/pretrain_ce.py index 97efd983bc..94948695d6 100644 --- a/egs/librispeech/SSL/hubert/pretrain_ce.py +++ b/egs/librispeech/SSL/hubert/pretrain_ce.py @@ -59,7 +59,7 @@ from optim import Eden, ScaledAdam from ssl_datamodule import LibriSpeechDataModule from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.functional import pad from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -644,7 +644,7 @@ def save_bad_model(suffix: str = ""): batch_size = batch["kmeans"].shape[0] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -945,7 +945,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1036,7 +1036,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/SSL/zipformer/finetune.py b/egs/librispeech/SSL/zipformer/finetune.py index 6bfab9d00a..c907b41c55 100644 --- a/egs/librispeech/SSL/zipformer/finetune.py +++ b/egs/librispeech/SSL/zipformer/finetune.py 
@@ -66,7 +66,7 @@ from model import AsrModel from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -1115,7 +1115,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1406,7 +1406,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1504,7 +1504,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/SSL/zipformer/model.py b/egs/librispeech/SSL/zipformer/model.py index b23fa32ea1..46a968b69e 100644 --- a/egs/librispeech/SSL/zipformer/model.py +++ b/egs/librispeech/SSL/zipformer/model.py @@ -221,7 +221,7 @@ def forward_transducer( # if self.training and random.random() < 0.25: # am = penalize_abs_values_gt(am, 30.0, 1.0e-04) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -256,7 +256,7 @@ def forward_transducer( # prior to do_rnnt_pruning (this is an optimization for speed). 
logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/librispeech/SSL/zipformer/pretrain.py b/egs/librispeech/SSL/zipformer/pretrain.py index 767c3bacb1..937fb382ed 100644 --- a/egs/librispeech/SSL/zipformer/pretrain.py +++ b/egs/librispeech/SSL/zipformer/pretrain.py @@ -58,7 +58,7 @@ from optim import Eden, ScaledAdam from ssl_datamodule import LibriSpeechDataModule from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -944,7 +944,7 @@ def save_bad_model(suffix: str = ""): batch_size = batch["kmeans"].shape[0] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1243,7 +1243,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1334,7 +1334,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/SSL/zipformer/zipformer.py b/egs/librispeech/SSL/zipformer/zipformer.py index 7e9ccb51fd..e9eff3357e 100644 --- a/egs/librispeech/SSL/zipformer/zipformer.py +++ b/egs/librispeech/SSL/zipformer/zipformer.py @@ -1849,7 +1849,7 @@ def _print_attn_entropy(self, attn_weights: Tensor): (num_heads, batch_size, seq_len, seq_len) = attn_weights.shape with torch.no_grad(): - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): attn_weights = attn_weights.to(torch.float32) attn_weights_entropy = ( -((attn_weights + 1.0e-20).log() * attn_weights) diff --git a/egs/librispeech/WSASR/conformer_ctc2/train.py b/egs/librispeech/WSASR/conformer_ctc2/train.py index fc77285625..82c68803fe 100755 --- a/egs/librispeech/WSASR/conformer_ctc2/train.py +++ b/egs/librispeech/WSASR/conformer_ctc2/train.py @@ -62,7 +62,7 @@ from lhotse.utils import fix_random_seed from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -757,7 +757,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1005,7 +1005,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1076,7 +1076,7 @@ def 
scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/librispeech/WSASR/conformer_ctc2/train_phone.py b/egs/librispeech/WSASR/conformer_ctc2/train_phone.py index 1c4bd50bfa..b276d05879 100755 --- a/egs/librispeech/WSASR/conformer_ctc2/train_phone.py +++ b/egs/librispeech/WSASR/conformer_ctc2/train_phone.py @@ -62,7 +62,7 @@ from lhotse.utils import fix_random_seed from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -758,7 +758,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1007,7 +1007,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1078,7 +1078,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
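The scan_pessimistic_batches_for_oom hunks change only the autocast call, but the surrounding pattern is worth spelling out: before training starts, the most demanding batches found by the sampler are pushed through a full forward and backward so an out-of-memory error surfaces immediately rather than mid-epoch. A stand-alone sketch of that idea follows; the names differ from the recipes and compute_loss is abbreviated to a plain model call.

import logging
import torch

def scan_batches_for_oom(model, dataset, optimizer, batches, use_fp16=True):
    # `batches` maps a selection criterion (e.g. "max duration") to the cuts
    # of the most demanding batch the sampler could produce.
    for criterion, cuts in batches.items():
        batch = dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=use_fp16):
                loss = model(batch)   # stand-in for compute_loss(...)
            loss.backward()           # backward included: activations + grads count too
            optimizer.zero_grad()
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                logging.error(f"OOM on the batch selected by criterion {criterion!r}")
            raise

The "warmup = 0.0" comment that precedes these hunks refers to running this scan with the model warmup factor at zero, so the pruned-loss derivatives stay zero and are not remembered by Adam's decaying averages.
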
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/libritts/ASR/zipformer/train.py b/egs/libritts/ASR/zipformer/train.py index 78e3330bd1..5485eaf0ab 100755 --- a/egs/libritts/ASR/zipformer/train.py +++ b/egs/libritts/ASR/zipformer/train.py @@ -80,7 +80,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -1049,8 +1049,8 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast( - "cuda", enabled=params.use_autocast, dtype=params.dtype + with torch.cuda.amp.autocast( + enabled=params.use_autocast, dtype=params.dtype ): loss, loss_info = compute_loss( params=params, @@ -1378,7 +1378,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_autocast, init_scale=1.0) + scaler = GradScaler(enabled=params.use_autocast, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1478,8 +1478,8 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast( - "cuda", enabled=params.use_autocast, dtype=params.dtype + with torch.cuda.amp.autocast( + enabled=params.use_autocast, dtype=params.dtype ): loss, _ = compute_loss( params=params, diff --git a/egs/libritts/CODEC/encodec/encodec.py b/egs/libritts/CODEC/encodec/encodec.py index 31fc4f1261..f21d494b62 100644 --- a/egs/libritts/CODEC/encodec/encodec.py +++ b/egs/libritts/CODEC/encodec/encodec.py @@ -29,7 +29,7 @@ WavReconstructionLoss, ) from torch import nn -from torch.amp import autocast +from torch.cuda.amp import autocast class Encodec(nn.Module): @@ -148,7 +148,7 @@ def _forward_generator( ) # calculate losses - with autocast("cuda", enabled=False): + with autocast(enabled=False): gen_stft_adv_loss = self.generator_adversarial_loss(outputs=y_hat) if self.multi_period_discriminator is not None: @@ -272,7 +272,7 @@ def _forward_discriminator( speech_hat.contiguous().detach(), ) # calculate losses - with autocast("cuda", enabled=False): + with autocast(enabled=False): ( disc_stft_real_adv_loss, disc_stft_fake_adv_loss, diff --git a/egs/libritts/CODEC/encodec/train.py b/egs/libritts/CODEC/encodec/train.py index 31349df43f..a4f2eb7ab7 100755 --- a/egs/libritts/CODEC/encodec/train.py +++ b/egs/libritts/CODEC/encodec/train.py @@ -34,7 +34,7 @@ from lhotse.utils import fix_random_seed from scheduler import WarmupCosineLrScheduler from torch import nn -from torch.amp import GradScaler, autocast +from torch.cuda.amp import GradScaler, autocast from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer from torch.utils.tensorboard import SummaryWriter @@ -466,7 +466,7 @@ def save_bad_model(suffix: str = ""): loss_info["samples"] = batch_size try: - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): d_weight = train_discriminator( params.lambda_adv, params.cur_epoch, @@ -502,7 +502,7 @@ def save_bad_model(suffix: str = ""): scaler.scale(disc_loss).backward() scaler.step(optimizer_d) 
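The EnCodec and VITS hunks share one GradScaler but use two optimizers and two passes per batch: a discriminator pass and a generator pass, each under autocast and each stepped through the shared scaler. Roughly, it looks like the sketch below; forward_generator is a stand-in flag for however the model selects the pass, scaler is a torch.cuda.amp.GradScaler, and the real loss bookkeeping is omitted.

from torch.cuda.amp import autocast

def gan_step(model, batch, optimizer_d, optimizer_g, scaler, use_fp16=True):
    # Discriminator pass.
    with autocast(enabled=use_fp16):
        loss_d = model(batch, forward_generator=False)   # stand-in call
    optimizer_d.zero_grad()
    scaler.scale(loss_d).backward()
    scaler.step(optimizer_d)

    # Generator pass on the same batch.
    with autocast(enabled=use_fp16):
        loss_g = model(batch, forward_generator=True)    # stand-in call
    optimizer_g.zero_grad()
    scaler.scale(loss_g).backward()
    scaler.step(optimizer_g)

    scaler.update()   # one scale update per iteration, after both steps

Inside the models, the mel-spectrogram and adversarial losses are still computed under autocast(enabled=False), i.e. in float32, as the encodec.py hunks above and the vits.py hunks below show.
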
- with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): g_weight = train_discriminator( params.lambda_adv, params.cur_epoch, @@ -846,7 +846,7 @@ def scan_pessimistic_batches_for_oom( ) = prepare_input(params, batch, device) try: # for discriminator - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): ( disc_stft_real_adv_loss, disc_stft_fake_adv_loss, @@ -876,7 +876,7 @@ def scan_pessimistic_batches_for_oom( optimizer_d.zero_grad() loss_d.backward() # for generator - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): ( commit_loss, gen_stft_adv_loss, @@ -1102,7 +1102,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/libritts/TTS/vits/train.py b/egs/libritts/TTS/vits/train.py index 6803d6eb2a..447fbcf5db 100755 --- a/egs/libritts/TTS/vits/train.py +++ b/egs/libritts/TTS/vits/train.py @@ -32,7 +32,7 @@ from lhotse.features.io import KaldiReader from lhotse.utils import fix_random_seed from tokenizer import Tokenizer -from torch.amp import GradScaler, autocast +from torch.cuda.amp import GradScaler, autocast from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer from torch.utils.tensorboard import SummaryWriter @@ -456,7 +456,7 @@ def save_bad_model(suffix: str = ""): loss_info["samples"] = batch_size try: - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): # forward discriminator loss_d, stats_d = model( text=tokens, @@ -475,7 +475,7 @@ def save_bad_model(suffix: str = ""): scaler.scale(loss_d).backward() scaler.step(optimizer_d) - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): # forward generator loss_g, stats_g = model( text=tokens, @@ -748,7 +748,7 @@ def scan_pessimistic_batches_for_oom( ) = prepare_input(batch, tokenizer, device, train_speaker_map) try: # for discriminator - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): loss_d, stats_d = model( text=tokens, text_lengths=tokens_lens, @@ -762,7 +762,7 @@ def scan_pessimistic_batches_for_oom( optimizer_d.zero_grad() loss_d.backward() # for generator - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): loss_g, stats_g = model( text=tokens, text_lengths=tokens_lens, @@ -922,7 +922,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/ljspeech/TTS/matcha/train.py b/egs/ljspeech/TTS/matcha/train.py index a25cc87234..853042413c 100755 --- a/egs/ljspeech/TTS/matcha/train.py +++ b/egs/ljspeech/TTS/matcha/train.py @@ -17,7 +17,7 @@ from model import fix_len_compatibility from models.matcha_tts import MatchaTTS from tokenizer import Tokenizer -from torch.amp import GradScaler, autocast +from torch.cuda.amp import GradScaler, autocast from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer from 
torch.utils.tensorboard import SummaryWriter @@ -474,7 +474,7 @@ def save_bad_model(suffix: str = ""): tokens_lens, ) = prepare_input(batch, tokenizer, device, params) try: - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): losses = get_losses( { "x": tokens, @@ -645,7 +645,7 @@ def run(rank, world_size, args): valid_cuts = ljspeech.valid_cuts() valid_dl = ljspeech.valid_dataloaders(valid_cuts) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/ljspeech/TTS/vits/train.py b/egs/ljspeech/TTS/vits/train.py index e9994319a3..184ae79afa 100755 --- a/egs/ljspeech/TTS/vits/train.py +++ b/egs/ljspeech/TTS/vits/train.py @@ -30,7 +30,7 @@ from lhotse.cut import Cut from lhotse.utils import fix_random_seed from tokenizer import Tokenizer -from torch.amp import GradScaler, autocast +from torch.cuda.amp import GradScaler, autocast from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer from torch.utils.tensorboard import SummaryWriter @@ -396,7 +396,7 @@ def save_bad_model(suffix: str = ""): loss_info["samples"] = batch_size try: - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): # forward discriminator loss_d, stats_d = model( text=tokens, @@ -414,7 +414,7 @@ def save_bad_model(suffix: str = ""): scaler.scale(loss_d).backward() scaler.step(optimizer_d) - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): # forward generator loss_g, stats_g = model( text=tokens, @@ -673,7 +673,7 @@ def scan_pessimistic_batches_for_oom( ) try: # for discriminator - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): loss_d, stats_d = model( text=tokens, text_lengths=tokens_lens, @@ -686,7 +686,7 @@ def scan_pessimistic_batches_for_oom( optimizer_d.zero_grad() loss_d.backward() # for generator - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): loss_g, stats_g = model( text=tokens, text_lengths=tokens_lens, @@ -838,7 +838,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/ljspeech/TTS/vits/utils.py b/egs/ljspeech/TTS/vits/utils.py index d51ff5f5c5..6a067f5961 100644 --- a/egs/ljspeech/TTS/vits/utils.py +++ b/egs/ljspeech/TTS/vits/utils.py @@ -23,7 +23,7 @@ import torch.distributed as dist import torch.nn as nn from lhotse.dataset.sampling.base import CutSampler -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer from torch.utils.tensorboard import SummaryWriter diff --git a/egs/ljspeech/TTS/vits/vits.py b/egs/ljspeech/TTS/vits/vits.py index 6fd6d219ba..a1fabf9ad6 100644 --- a/egs/ljspeech/TTS/vits/vits.py +++ b/egs/ljspeech/TTS/vits/vits.py @@ -25,7 +25,7 @@ KLDivergenceLoss, MelSpectrogramLoss, ) -from torch.amp import autocast +from torch.cuda.amp import autocast from utils import get_segments AVAILABLE_GENERATERS = { @@ -410,7 
+410,7 @@ def _forward_generator( p = self.discriminator(speech_) # calculate losses - with autocast("cuda", enabled=False): + with autocast(enabled=False): if not return_sample: mel_loss = self.mel_loss(speech_hat_, speech_) else: @@ -518,7 +518,7 @@ def _forward_discrminator( p = self.discriminator(speech_) # calculate losses - with autocast("cuda", enabled=False): + with autocast(enabled=False): real_loss, fake_loss = self.discriminator_adv_loss(p_hat, p) loss = real_loss + fake_loss diff --git a/egs/mdcc/ASR/zipformer/train.py b/egs/mdcc/ASR/zipformer/train.py index 22249286ae..730db77180 100755 --- a/egs/mdcc/ASR/zipformer/train.py +++ b/egs/mdcc/ASR/zipformer/train.py @@ -68,7 +68,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -906,7 +906,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1197,7 +1197,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1298,7 +1298,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/mgb2/ASR/pruned_transducer_stateless5/train.py b/egs/mgb2/ASR/pruned_transducer_stateless5/train.py index 916ada4758..48468cfbdb 100755 --- a/egs/mgb2/ASR/pruned_transducer_stateless5/train.py +++ b/egs/mgb2/ASR/pruned_transducer_stateless5/train.py @@ -66,7 +66,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.utils import clip_grad_norm_ from torch.utils.tensorboard import SummaryWriter @@ -751,7 +751,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info, inf_flag = compute_loss( params=params, model=model, @@ -1012,7 +1012,7 @@ def remove_short_and_long_text(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1115,7 +1115,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _, _ = compute_loss( params=params, model=model, diff --git a/egs/multi_zh-hans/ASR/whisper/train.py b/egs/multi_zh-hans/ASR/whisper/train.py index 1a11d01af0..fe2d950c1c 100755 --- a/egs/multi_zh-hans/ASR/whisper/train.py +++ b/egs/multi_zh-hans/ASR/whisper/train.py @@ -61,7 +61,7 @@ from multi_dataset import MultiDataset from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.functional import pad as pad_tensor from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -566,7 +566,7 @@ def compute_validation_loss( tot_loss = MetricsTracker() for batch_idx, batch in enumerate(valid_dl): - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, tokenizer=tokenizer, @@ -675,7 +675,7 @@ def train_one_epoch( ) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, tokenizer=tokenizer, @@ -913,7 +913,7 @@ def remove_short_and_long_utt(c: Cut): valid_cuts = multi_dataset.dev_cuts() valid_dl = data_module.valid_dataloaders(valid_cuts) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/multi_zh-hans/ASR/zipformer/train.py b/egs/multi_zh-hans/ASR/zipformer/train.py index 047253d5b9..3dbfc48eb9 100755 --- a/egs/multi_zh-hans/ASR/zipformer/train.py +++ b/egs/multi_zh-hans/ASR/zipformer/train.py @@ -75,7 +75,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -987,7 +987,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1278,7 +1278,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1378,7 +1378,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/multi_zh_en/ASR/zipformer/train.py b/egs/multi_zh_en/ASR/zipformer/train.py index 9e64defa33..04bb41214d 100755 --- a/egs/multi_zh_en/ASR/zipformer/train.py +++ b/egs/multi_zh_en/ASR/zipformer/train.py @@ -75,7 +75,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch 
import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -969,7 +969,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1269,7 +1269,7 @@ def tokenize_and_encode_text(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1369,7 +1369,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/reazonspeech/ASR/zipformer/do_not_use_it_directly.py b/egs/reazonspeech/ASR/zipformer/do_not_use_it_directly.py index c01e4d3364..072679cfc9 100755 --- a/egs/reazonspeech/ASR/zipformer/do_not_use_it_directly.py +++ b/egs/reazonspeech/ASR/zipformer/do_not_use_it_directly.py @@ -67,7 +67,7 @@ from optim import Eden, ScaledAdam from tokenizer import Tokenizer from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer_for_ncnn_export_only import Zipformer @@ -822,7 +822,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1113,7 +1113,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1213,7 +1213,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/reazonspeech/ASR/zipformer/train.py b/egs/reazonspeech/ASR/zipformer/train.py index 8829a18caa..30bd3efbad 100755 --- a/egs/reazonspeech/ASR/zipformer/train.py +++ b/egs/reazonspeech/ASR/zipformer/train.py @@ -74,7 +74,7 @@ from subsampling import Conv2dSubsampling from tokenizer import Tokenizer from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -945,7 +945,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, 
loss_info = compute_loss( params=params, model=model, @@ -1235,7 +1235,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1335,7 +1335,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/speech_llm/ASR_LLM/whisper_llm_zh/train.py b/egs/speech_llm/ASR_LLM/whisper_llm_zh/train.py index 5de2cf2b0a..5f224c9848 100755 --- a/egs/speech_llm/ASR_LLM/whisper_llm_zh/train.py +++ b/egs/speech_llm/ASR_LLM/whisper_llm_zh/train.py @@ -451,7 +451,7 @@ def compute_validation_loss( tot_loss = MetricsTracker() for batch_idx, batch in enumerate(valid_dl): - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, tokenizer=tokenizer, @@ -566,7 +566,7 @@ def train_one_epoch( f"rm -rf {params.exp_dir}/epoch-{params.cur_epoch}-checkpoint-{batch_idx}" ) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, tokenizer=tokenizer, diff --git a/egs/spgispeech/ASR/pruned_transducer_stateless2/train.py b/egs/spgispeech/ASR/pruned_transducer_stateless2/train.py index 1e55ada87f..a9146a0feb 100755 --- a/egs/spgispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/spgispeech/ASR/pruned_transducer_stateless2/train.py @@ -65,7 +65,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -649,7 +649,7 @@ def train_one_epoch( params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -857,7 +857,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -957,7 +957,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
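The whisper and whisper_llm_zh hunks also touch compute_validation_loss: validation batches run through the same autocast context as training, just without the scaler, since no backward pass is taken. Schematically it is the loop below; the recipes handle gradient disabling and loss aggregation through their own helpers, which are omitted here.

import torch

def validate(model, valid_dl, use_fp16=True):
    model.eval()
    total, count = 0.0, 0
    with torch.no_grad():                       # gradients are not needed for validation
        for batch in valid_dl:
            with torch.cuda.amp.autocast(enabled=use_fp16):
                loss = model(batch)             # stand-in for compute_loss(...)
            total += loss.item()
            count += 1
    model.train()
    return total / max(count, 1)
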
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/spgispeech/ASR/zipformer/train.py b/egs/spgispeech/ASR/zipformer/train.py index 319713b027..dfc21c968a 100755 --- a/egs/spgispeech/ASR/zipformer/train.py +++ b/egs/spgispeech/ASR/zipformer/train.py @@ -74,7 +74,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -946,7 +946,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1217,7 +1217,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1317,7 +1317,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/tal_csasr/ASR/pruned_transducer_stateless5/train.py b/egs/tal_csasr/ASR/pruned_transducer_stateless5/train.py index c44e30b89e..c0aedd725a 100755 --- a/egs/tal_csasr/ASR/pruned_transducer_stateless5/train.py +++ b/egs/tal_csasr/ASR/pruned_transducer_stateless5/train.py @@ -69,7 +69,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -726,7 +726,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) # print(batch["supervisions"]) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -967,7 +967,7 @@ def text_normalize_for_cut(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1039,7 +1039,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/tal_csasr/ASR/pruned_transducer_stateless7_bbpe/train.py b/egs/tal_csasr/ASR/pruned_transducer_stateless7_bbpe/train.py index dd9576d994..2108266ec3 100755 --- a/egs/tal_csasr/ASR/pruned_transducer_stateless7_bbpe/train.py +++ b/egs/tal_csasr/ASR/pruned_transducer_stateless7_bbpe/train.py @@ -64,7 +64,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -801,7 +801,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1101,7 +1101,7 @@ def tokenize_text_in_cut(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1201,7 +1201,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/tedlium3/ASR/conformer_ctc2/train.py b/egs/tedlium3/ASR/conformer_ctc2/train.py index 179dcf14a7..fc3e3b2d92 100755 --- a/egs/tedlium3/ASR/conformer_ctc2/train.py +++ b/egs/tedlium3/ASR/conformer_ctc2/train.py @@ -57,7 +57,7 @@ from lhotse.utils import fix_random_seed from local.convert_transcript_words_to_bpe_ids import convert_texts_into_ids from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -710,7 +710,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -941,7 +941,7 @@ def run(rank, world_size, args): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1011,7 +1011,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/tedlium3/ASR/zipformer/model.py b/egs/tedlium3/ASR/zipformer/model.py index 0d9b395edf..65b052ab94 100644 --- a/egs/tedlium3/ASR/zipformer/model.py +++ b/egs/tedlium3/ASR/zipformer/model.py @@ -173,7 +173,7 @@ def forward( # if self.training and random.random() < 0.25: # am = penalize_abs_values_gt(am, 
30.0, 1.0e-04) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( lm=lm.float(), am=am.float(), @@ -209,7 +209,7 @@ def forward( # prior to do_rnnt_pruning (this is an optimization for speed). logits = self.joiner(am_pruned, lm_pruned, project_input=False) - with torch.amp.autocast("cuda", enabled=False): + with torch.cuda.amp.autocast(enabled=False): pruned_loss = k2.rnnt_loss_pruned( logits=logits.float(), symbols=y_padded, diff --git a/egs/tedlium3/ASR/zipformer/train.py b/egs/tedlium3/ASR/zipformer/train.py index ffe8768633..14a44efb30 100755 --- a/egs/tedlium3/ASR/zipformer/train.py +++ b/egs/tedlium3/ASR/zipformer/train.py @@ -73,7 +73,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -911,7 +911,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1160,7 +1160,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1260,7 +1260,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/vctk/TTS/vits/train.py b/egs/vctk/TTS/vits/train.py index 6249640d43..4686de1694 100755 --- a/egs/vctk/TTS/vits/train.py +++ b/egs/vctk/TTS/vits/train.py @@ -31,7 +31,7 @@ from lhotse.cut import Cut from lhotse.utils import fix_random_seed from tokenizer import Tokenizer -from torch.amp import GradScaler, autocast +from torch.cuda.amp import GradScaler, autocast from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer from torch.utils.tensorboard import SummaryWriter @@ -448,7 +448,7 @@ def save_bad_model(suffix: str = ""): loss_info["samples"] = batch_size try: - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): # forward discriminator loss_d, stats_d = model( text=tokens, @@ -467,7 +467,7 @@ def save_bad_model(suffix: str = ""): scaler.scale(loss_d).backward() scaler.step(optimizer_d) - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): # forward generator loss_g, stats_g = model( text=tokens, @@ -740,7 +740,7 @@ def scan_pessimistic_batches_for_oom( ) = prepare_input(batch, tokenizer, device, speaker_map) try: # for discriminator - with autocast("cuda", enabled=params.use_fp16): + with autocast(enabled=params.use_fp16): loss_d, stats_d = model( text=tokens, text_lengths=tokens_lens, @@ -754,7 +754,7 @@ def scan_pessimistic_batches_for_oom( optimizer_d.zero_grad() loss_d.backward() # for generator - with autocast("cuda", enabled=params.use_fp16): + with 
autocast(enabled=params.use_fp16): loss_g, stats_g = model( text=tokens, text_lengths=tokens_lens, @@ -910,7 +910,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless2/finetune.py b/egs/wenetspeech/ASR/pruned_transducer_stateless2/finetune.py index 2fd6f64784..c34f1593d1 100755 --- a/egs/wenetspeech/ASR/pruned_transducer_stateless2/finetune.py +++ b/egs/wenetspeech/ASR/pruned_transducer_stateless2/finetune.py @@ -52,7 +52,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -718,7 +718,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -907,7 +907,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1005,7 +1005,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless2/train.py b/egs/wenetspeech/ASR/pruned_transducer_stateless2/train.py index c90f03f080..49977e01b5 100644 --- a/egs/wenetspeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/wenetspeech/ASR/pruned_transducer_stateless2/train.py @@ -101,7 +101,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -687,7 +687,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -921,7 +921,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1019,7 +1019,7 @@ def scan_pessimistic_batches_for_oom( # warmup = 0.0 is so that the derivs for the pruned loss stay zero # (i.e. are not remembered by the decaying-average in adam), because # we want to avoid these params being subject to shrinkage in adam. 
- with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless5/train.py b/egs/wenetspeech/ASR/pruned_transducer_stateless5/train.py index 7b05eca973..931e699d92 100755 --- a/egs/wenetspeech/ASR/pruned_transducer_stateless5/train.py +++ b/egs/wenetspeech/ASR/pruned_transducer_stateless5/train.py @@ -81,7 +81,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -796,7 +796,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1056,7 +1056,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1158,7 +1158,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/wenetspeech/ASR/whisper/train.py b/egs/wenetspeech/ASR/whisper/train.py index c46a4d84c9..4e55fd6a86 100644 --- a/egs/wenetspeech/ASR/whisper/train.py +++ b/egs/wenetspeech/ASR/whisper/train.py @@ -61,7 +61,7 @@ from lhotse.utils import fix_random_seed from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.functional import pad as pad_tensor from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -513,7 +513,7 @@ def compute_validation_loss( tot_loss = MetricsTracker() for batch_idx, batch in enumerate(valid_dl): - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, tokenizer=tokenizer, @@ -621,7 +621,7 @@ def train_one_epoch( f"rm -rf {params.exp_dir}/epoch-{params.cur_epoch}-checkpoint-{batch_idx}" ) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, tokenizer=tokenizer, @@ -843,7 +843,7 @@ def remove_short_and_long_utt(c: Cut): train_dl = wenetspeech.train_dataloaders(train_cuts) valid_dl = wenetspeech.valid_dataloaders(wenetspeech.valid_cuts()) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/wenetspeech/ASR/zipformer/train.py b/egs/wenetspeech/ASR/zipformer/train.py index b6d55447f7..25b16f6324 100755 --- a/egs/wenetspeech/ASR/zipformer/train.py +++ b/egs/wenetspeech/ASR/zipformer/train.py @@ -71,7 
+71,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -910,7 +910,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1201,7 +1201,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1302,7 +1302,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/wenetspeech/KWS/zipformer/finetune.py b/egs/wenetspeech/KWS/zipformer/finetune.py index 00db4309d5..d19172b38e 100755 --- a/egs/wenetspeech/KWS/zipformer/finetune.py +++ b/egs/wenetspeech/KWS/zipformer/finetune.py @@ -82,7 +82,7 @@ from lhotse.utils import fix_random_seed from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from train import ( @@ -414,7 +414,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -703,7 +703,7 @@ def encode_text(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) diff --git a/egs/wenetspeech/KWS/zipformer/train.py b/egs/wenetspeech/KWS/zipformer/train.py index 4dc30ad89f..40960c2ae7 100755 --- a/egs/wenetspeech/KWS/zipformer/train.py +++ b/egs/wenetspeech/KWS/zipformer/train.py @@ -73,7 +73,7 @@ from scaling import ScheduledFloat from subsampling import Conv2dSubsampling from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer2 @@ -967,7 +967,7 @@ def save_bad_model(suffix: str = ""): batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1252,7 +1252,7 @@ def encode_text(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in 
checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1353,7 +1353,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/wenetspeech4tts/TTS/valle/train.py b/egs/wenetspeech4tts/TTS/valle/train.py index 1c6972e93c..e9ec548f33 100755 --- a/egs/wenetspeech4tts/TTS/valle/train.py +++ b/egs/wenetspeech4tts/TTS/valle/train.py @@ -65,7 +65,7 @@ from optim import Eden, ScaledAdam from tokenizer import TextTokenCollater, get_text_token_collater from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from tts_datamodule import TtsDataModule @@ -764,7 +764,7 @@ def train_one_epoch( batch_size = len(batch["text"]) try: - with torch.amp.autocast("cuda", dtype=dtype, enabled=enabled): + with torch.cuda.amp.autocast(dtype=dtype, enabled=enabled): _, loss, loss_info = compute_loss( params=params, model=model, @@ -897,7 +897,7 @@ def train_one_epoch( # Calculate validation loss in Rank 0 model.eval() logging.info("Computing validation loss") - with torch.amp.autocast("cuda", dtype=dtype): + with torch.cuda.amp.autocast(dtype=dtype): valid_info = compute_validation_loss( params=params, model=model, @@ -1102,9 +1102,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler( - "cuda", enabled=(params.dtype in ["fp16", "float16"]), init_scale=1.0 - ) + scaler = GradScaler(enabled=(params.dtype in ["fp16", "float16"]), init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1198,7 +1196,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", dtype=dtype): + with torch.cuda.amp.autocast(dtype=dtype): _, loss, _ = compute_loss( params=params, model=model, diff --git a/egs/xbmu_amdo31/ASR/pruned_transducer_stateless5/train.py b/egs/xbmu_amdo31/ASR/pruned_transducer_stateless5/train.py index 5c3000a57c..a6fa46b171 100755 --- a/egs/xbmu_amdo31/ASR/pruned_transducer_stateless5/train.py +++ b/egs/xbmu_amdo31/ASR/pruned_transducer_stateless5/train.py @@ -68,7 +68,7 @@ from model import Transducer from optim import Eden, Eve from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -814,7 +814,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1072,7 +1072,7 @@ def remove_short_and_long_utt(c: Cut): warmup=0.0 if params.start_epoch == 1 else 1.0, ) - scaler = GradScaler("cuda", enabled=params.use_fp16) + scaler = GradScaler(enabled=params.use_fp16) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1141,7 +1141,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in 
batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/egs/xbmu_amdo31/ASR/pruned_transducer_stateless7/train.py b/egs/xbmu_amdo31/ASR/pruned_transducer_stateless7/train.py index a1b3be2468..dd72551d9e 100755 --- a/egs/xbmu_amdo31/ASR/pruned_transducer_stateless7/train.py +++ b/egs/xbmu_amdo31/ASR/pruned_transducer_stateless7/train.py @@ -67,7 +67,7 @@ from model import Transducer from optim import Eden, ScaledAdam from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from zipformer import Zipformer @@ -785,7 +785,7 @@ def train_one_epoch( batch_size = len(batch["supervisions"]["text"]) try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( params=params, model=model, @@ -1074,7 +1074,7 @@ def remove_short_and_long_utt(c: Cut): params=params, ) - scaler = GradScaler("cuda", enabled=params.use_fp16, init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1174,7 +1174,7 @@ def scan_pessimistic_batches_for_oom( for criterion, cuts in batches.items(): batch = train_dl.dataset[cuts] try: - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, _ = compute_loss( params=params, model=model, diff --git a/icefall/checkpoint.py b/icefall/checkpoint.py index b3a0fb8657..d31ce13019 100644 --- a/icefall/checkpoint.py +++ b/icefall/checkpoint.py @@ -27,7 +27,7 @@ import torch.nn as nn from lhotse.dataset.sampling.base import CutSampler from torch import Tensor -from torch.amp import GradScaler +from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer diff --git a/icefall/rnn_lm/train.py b/icefall/rnn_lm/train.py index 257cdb09a9..0178b80bfc 100755 --- a/icefall/rnn_lm/train.py +++ b/icefall/rnn_lm/train.py @@ -401,7 +401,7 @@ def compute_validation_loss( for batch_idx, batch in enumerate(valid_dl): x, y, sentence_lengths = batch - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( model=model, x=x, @@ -470,7 +470,7 @@ def train_one_epoch( params.batch_idx_train += 1 x, y, sentence_lengths = batch batch_size = x.size(0) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( model=model, x=x, diff --git a/icefall/transformer_lm/train.py b/icefall/transformer_lm/train.py index 6faa634843..c36abfcdfa 100644 --- a/icefall/transformer_lm/train.py +++ b/icefall/transformer_lm/train.py @@ -341,7 +341,7 @@ def compute_validation_loss( for batch_idx, batch in enumerate(valid_dl): x, y, sentence_lengths = batch - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( model=model, x=x, @@ -403,7 +403,7 @@ def train_one_epoch( params.batch_idx_train += 1 x, y, sentence_lengths = batch batch_size = 
x.size(0) - with torch.amp.autocast("cuda", enabled=params.use_fp16): + with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( model=model, x=x,
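
Every hunk above only swaps between the two AMP entry points; the surrounding training logic is untouched. The sketch below is not taken from any file in this patch. It is a minimal, self-contained illustration (placeholder model, optimizer, and a `use_fp16` flag standing in for `params.use_fp16`) of the `torch.cuda.amp` style that the patch standardizes on, with the newer device-generic `torch.amp` form shown in comments for comparison. It assumes a CUDA-capable machine.

```python
# Minimal AMP training-step sketch; model, optimizer, and data are placeholders.
import torch

use_fp16 = True
device = torch.device("cuda")
model = torch.nn.Linear(16, 4).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Style used throughout this patch (deprecated in recent PyTorch releases,
# where it emits a warning, but still functional):
scaler = torch.cuda.amp.GradScaler(enabled=use_fp16, init_scale=1.0)
for _ in range(2):
    x = torch.randn(8, 16, device=device)
    with torch.cuda.amp.autocast(enabled=use_fp16):
        loss = model(x).sum()
    optimizer.zero_grad()
    scaler.scale(loss).backward()  # scale the loss before backward
    scaler.step(optimizer)         # unscale gradients and step
    scaler.update()                # adjust the scale for the next iteration

# Device-generic equivalent (PyTorch >= 2.3), i.e. the form being replaced above:
# scaler = torch.amp.GradScaler("cuda", enabled=use_fp16, init_scale=1.0)
# with torch.amp.autocast("cuda", enabled=use_fp16):
#     ...
```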