diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py
index 99fe647938..828106f418 100755
--- a/egs/librispeech/ASR/conformer_ctc/train.py
+++ b/egs/librispeech/ASR/conformer_ctc/train.py
@@ -557,7 +557,6 @@ def train_one_epoch(
             )
 
         if batch_idx % params.log_interval == 0:
-
             if tb_writer is not None:
                 loss_info.write_summary(
                     tb_writer, "train/current_", params.batch_idx_train
diff --git a/egs/librispeech/ASR/local/download_lm.py b/egs/librispeech/ASR/local/download_lm.py
index da1648d069..5a36ff2a94 100755
--- a/egs/librispeech/ASR/local/download_lm.py
+++ b/egs/librispeech/ASR/local/download_lm.py
@@ -43,6 +43,7 @@
 
 from tqdm.auto import tqdm
 
+
 # This function is copied from lhotse
 def tqdm_urlretrieve_hook(t):
     """Wraps tqdm instance.
diff --git a/egs/librispeech/ASR/long_file_recog/beam_search.py b/egs/librispeech/ASR/long_file_recog/beam_search.py
index f8c31861c2..b65e9d40a3 100644
--- a/egs/librispeech/ASR/long_file_recog/beam_search.py
+++ b/egs/librispeech/ASR/long_file_recog/beam_search.py
@@ -236,7 +236,7 @@ def greedy_search_batch(
     encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
 
     offset = 0
-    for (t, batch_size) in enumerate(batch_size_list):
+    for t, batch_size in enumerate(batch_size_list):
         start = offset
         end = offset + batch_size
         current_encoder_out = encoder_out.data[start:end]
@@ -507,7 +507,7 @@ def modified_beam_search(
 
     offset = 0
     finalized_B = []
-    for (t, batch_size) in enumerate(batch_size_list):
+    for t, batch_size in enumerate(batch_size_list):
         start = offset
         end = offset + batch_size
         current_encoder_out = encoder_out.data[start:end]
diff --git a/egs/librispeech/ASR/long_file_recog/merge_chunks.py b/egs/librispeech/ASR/long_file_recog/merge_chunks.py
index d38d9c86a0..9e31e00d55 100755
--- a/egs/librispeech/ASR/long_file_recog/merge_chunks.py
+++ b/egs/librispeech/ASR/long_file_recog/merge_chunks.py
@@ -162,7 +162,6 @@ def _merge(cut_list: List[Cut], rec_id: str, utt_idx: int):
 
     futures = []
     with ThreadPoolExecutor(max_workers=1) as executor:
-
         for cut in cuts_chunk:
             cur_rec_id = cut.recording.id
             if len(cut_list) == 0:
diff --git a/egs/librispeech/ASR/long_file_recog/recognize.py b/egs/librispeech/ASR/long_file_recog/recognize.py
index 96c83f8591..466253446f 100755
--- a/egs/librispeech/ASR/long_file_recog/recognize.py
+++ b/egs/librispeech/ASR/long_file_recog/recognize.py
@@ -264,6 +264,7 @@ def decode_dataset(
         - timestamps of reference transcript
         - timestamps of predicted result
     """
+
     #  Background worker to add alignemnt and save cuts to disk.
     def _save_worker(
         cuts: List[Cut],
diff --git a/egs/librispeech/ASR/pruned2_knowledge/optim.py b/egs/librispeech/ASR/pruned2_knowledge/optim.py
index 76cd4e11ee..9f287ce70e 100644
--- a/egs/librispeech/ASR/pruned2_knowledge/optim.py
+++ b/egs/librispeech/ASR/pruned2_knowledge/optim.py
@@ -66,7 +66,6 @@ def __init__(
         weight_decay=1e-3,
         target_rms=0.1,
     ):
-
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 <= eps:
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
index 3298568a33..7fcd242fcd 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
@@ -719,7 +719,7 @@ def greedy_search_batch(
     encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
 
     offset = 0
-    for (t, batch_size) in enumerate(batch_size_list):
+    for t, batch_size in enumerate(batch_size_list):
         start = offset
         end = offset + batch_size
         current_encoder_out = encoder_out.data[start:end]
@@ -1019,7 +1019,7 @@ def modified_beam_search(
 
     offset = 0
     finalized_B = []
-    for (t, batch_size) in enumerate(batch_size_list):
+    for t, batch_size in enumerate(batch_size_list):
         start = offset
         end = offset + batch_size
         current_encoder_out = encoder_out.data[start:end]
@@ -1227,7 +1227,7 @@ def modified_beam_search_lm_rescore(
 
     offset = 0
     finalized_B = []
-    for (t, batch_size) in enumerate(batch_size_list):
+    for t, batch_size in enumerate(batch_size_list):
         start = offset
         end = offset + batch_size
         current_encoder_out = encoder_out.data[start:end]
@@ -1427,7 +1427,7 @@ def modified_beam_search_lm_rescore_LODR(
 
     offset = 0
     finalized_B = []
-    for (t, batch_size) in enumerate(batch_size_list):
+    for t, batch_size in enumerate(batch_size_list):
         start = offset
         end = offset + batch_size
         current_encoder_out = encoder_out.data[start:end]
@@ -2608,7 +2608,6 @@ def modified_beam_search_LODR(
                 context_score = 0
                 new_context_state = None if context_graph is None else hyp.context_state
                 if new_token not in (blank_id, unk_id):
-
                     if context_graph is not None:
                         (
                             context_score,
@@ -2758,7 +2757,7 @@ def modified_beam_search_lm_shallow_fusion(
 
     offset = 0
     finalized_B = []
-    for (t, batch_size) in enumerate(batch_size_list):
+    for t, batch_size in enumerate(batch_size_list):
         start = offset
         end = offset + batch_size
         current_encoder_out = encoder_out.data[start:end]  # get batch
@@ -2900,7 +2899,6 @@ def modified_beam_search_lm_shallow_fusion(
                 new_token = topk_token_indexes[k]
                 new_timestamp = hyp.timestamp[:]
                 if new_token not in (blank_id, unk_id):
-
                     ys.append(new_token)
                     new_timestamp.append(t)
 
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py
index 2d7f557ad9..f54bc2709e 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py
@@ -66,7 +66,6 @@ def __init__(
         weight_decay=1e-3,
         target_rms=0.1,
     ):
-
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 <= eps:
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index 963ebdc2dd..91d64c1df4 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -528,7 +528,6 @@ def _flatten_parameters(self, flat_weights) -> None:
             return
 
         with torch.cuda.device_of(first_fw):
-
             # Note: no_grad() is necessary since _cudnn_rnn_flatten_weight is
             # an inplace operation on self._flat_weights
             with torch.no_grad():
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py b/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py
index 14ff86f233..3bca7db2cc 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py
@@ -56,7 +56,6 @@ class CodebookIndexExtractor:
     """
 
     def __init__(self, params: AttributeDict):
-
         self.params = params
         params.subsets = ["clean-100"]
         if self.params.full_libri:
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/alignment.py b/egs/librispeech/ASR/pruned_transducer_stateless7/alignment.py
index 76cd56bbb7..bfb5fe6093 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/alignment.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/alignment.py
@@ -111,7 +111,7 @@ def batch_force_alignment(
 
     offset = 0
     finalized_B = []
-    for (t, batch_size) in enumerate(batch_size_list):
+    for t, batch_size in enumerate(batch_size_list):
         start = offset
         end = offset + batch_size
         current_encoder_out = encoder_out.data[start:end]
diff --git a/egs/librispeech/ASR/streaming_conformer_ctc/train.py b/egs/librispeech/ASR/streaming_conformer_ctc/train.py
index bb55ed6bb4..14d7274c2f 100755
--- a/egs/librispeech/ASR/streaming_conformer_ctc/train.py
+++ b/egs/librispeech/ASR/streaming_conformer_ctc/train.py
@@ -543,7 +543,6 @@ def train_one_epoch(
             )
 
         if batch_idx % params.log_interval == 0:
-
             if tb_writer is not None:
                 loss_info.write_summary(
                     tb_writer, "train/current_", params.batch_idx_train
diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
index 0aa1587bae..90245ed463 100755
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py
@@ -463,7 +463,6 @@ def train_one_epoch(
                 f"tot_loss[{tot_loss}], batch size: {batch_size}"
             )
         if batch_idx % params.log_interval == 0:
-
             if tb_writer is not None:
                 loss_info.write_summary(
                     tb_writer, "train/current_", params.batch_idx_train
diff --git a/egs/librispeech/ASR/transducer/train.py b/egs/librispeech/ASR/transducer/train.py
index f2a09346c4..9ac6b7d03d 100755
--- a/egs/librispeech/ASR/transducer/train.py
+++ b/egs/librispeech/ASR/transducer/train.py
@@ -513,7 +513,6 @@ def train_one_epoch(
             )
 
         if batch_idx % params.log_interval == 0:
-
             if tb_writer is not None:
                 loss_info.write_summary(
                     tb_writer, "train/current_", params.batch_idx_train
diff --git a/egs/librispeech/ASR/transducer_lstm/train.py b/egs/librispeech/ASR/transducer_lstm/train.py
index a6f2bd08c0..92134116cd 100755
--- a/egs/librispeech/ASR/transducer_lstm/train.py
+++ b/egs/librispeech/ASR/transducer_lstm/train.py
@@ -517,7 +517,6 @@ def train_one_epoch(
             )
 
         if batch_idx % params.log_interval == 0:
-
             if tb_writer is not None:
                 loss_info.write_summary(
                     tb_writer, "train/current_", params.batch_idx_train
diff --git a/egs/librispeech/ASR/zipformer/scaling.py b/egs/librispeech/ASR/zipformer/scaling.py
index 23fd279b31..c0f1e30873 100644
--- a/egs/librispeech/ASR/zipformer/scaling.py
+++ b/egs/librispeech/ASR/zipformer/scaling.py
@@ -70,7 +70,7 @@ def __init__(self, *args):
             self.pairs = list(args[0].pairs)
         else:
             self.pairs = [(float(x), float(y)) for x, y in args]
-        for (x, y) in self.pairs:
+        for x, y in self.pairs:
             assert isinstance(x, (float, int)), type(x)
             assert isinstance(y, (float, int)), type(y)
 
diff --git a/icefall/__init__.py b/icefall/__init__.py
index 05e2b408c1..b1e4313e9b 100644
--- a/icefall/__init__.py
+++ b/icefall/__init__.py
@@ -1,12 +1,6 @@
 # isort:skip_file
 
-from . import (
-    checkpoint,
-    decode,
-    dist,
-    env,
-    utils
-)
+from . import checkpoint, decode, dist, env, utils
 
 from .byte_utils import (
     byte_decode,
diff --git a/icefall/context_graph.py b/icefall/context_graph.py
index 01836df04c..0b7c42c0b9 100644
--- a/icefall/context_graph.py
+++ b/icefall/context_graph.py
@@ -227,7 +227,6 @@ def draw(
         filename: Optional[str] = "",
         symbol_table: Optional[Dict[int, str]] = None,
     ) -> "Digraph":  # noqa
-
         """Visualize a ContextGraph via graphviz.
 
         Render ContextGraph as an image via graphviz, and return the Digraph object;
diff --git a/icefall/diagnostics.py b/icefall/diagnostics.py
index 98870684e5..700dc1500a 100644
--- a/icefall/diagnostics.py
+++ b/icefall/diagnostics.py
@@ -23,6 +23,7 @@
 import torch
 from torch import Tensor, nn
 
+
 class TensorDiagnosticOptions(object):
     """Options object for tensor diagnostics:
 
@@ -77,11 +78,11 @@ def get_tensor_stats(
     elif stats_type == "abs":
         x = x.abs()
     elif stats_type == "rms":
-        x = x ** 2
+        x = x**2
     elif stats_type == "positive":
         x = (x > 0).to(dtype=torch.float)
     else:
-        assert stats_type in [ "value", "max", "min" ]
+        assert stats_type in ["value", "max", "min"]
 
     sum_dims = [d for d in range(x.ndim) if d != dim]
     if len(sum_dims) > 0:
@@ -121,10 +122,10 @@ def __init__(self, opts: TensorDiagnosticOptions, name: str):
         self.class_name = None  # will assign in accumulate()
 
         self.stats = None  # we'll later assign a list to self.stats.
-                           # It's a list of dicts, indexed by dim (i.e. by the
-                           # axis of the tensor).  The dicts, in turn, are
-                           # indexed by `stats-type` which are strings in
-                           # ["abs", "max", "min", "positive", "value", "rms"].
+        # self.stats is a list of dicts, indexed by dim (i.e. by the
+        # axis of the tensor).  The dicts, in turn, are indexed by
+        # `stats-type`, which is one of the strings in
+        # ["abs", "max", "min", "positive", "value", "rms"].
 
         # scalar_stats contains some analysis of the activations and gradients,
         self.scalar_stats = None
@@ -139,7 +140,6 @@ def __init__(self, opts: TensorDiagnosticOptions, name: str):
         # only adding a new element to the list if there was a different dim.
         # if the string in the key is "eigs", if we detect a length mismatch we put None as the value.
 
-
     def accumulate(self, x, class_name: Optional[str] = None):
         """
         Accumulate tensors.
@@ -193,17 +193,12 @@ def accumulate(self, x, class_name: Optional[str] = None):
                         done = True
                         break
                 if not done:
-                    if (
-                        this_dim_stats[stats_type] != []
-                        and stats_type == "eigs"
-                    ):
+                    if this_dim_stats[stats_type] != [] and stats_type == "eigs":
                         # >1 size encountered on this dim, e.g. it's a batch or time dimension,
                         # don't accumulat "eigs" stats type, it uses too much memory
                         this_dim_stats[stats_type] = None
                     else:
-                        this_dim_stats[stats_type].append(
-                            TensorAndCount(stats, count)
-                        )
+                        this_dim_stats[stats_type].append(TensorAndCount(stats, count))
 
     def print_diagnostics(self):
         """Print diagnostics for each dimension of the tensor."""
@@ -220,8 +215,11 @@ def print_diagnostics(self):
                     for r, v in zip(rms_stats_list, value_stats_list):
                         stddev_stats_list.append(
                             # r.count and v.count should be the same, but we don't check this.
-                            TensorAndCount(r.tensor - v.tensor * v.tensor / (v.count + 1.0e-20),
-                                           r.count))
+                            TensorAndCount(
+                                r.tensor - v.tensor * v.tensor / (v.count + 1.0e-20),
+                                r.count,
+                            )
+                        )
                     this_dim_stats["stddev"] = stddev_stats_list
 
             for stats_type, stats_list in this_dim_stats.items():
@@ -232,7 +230,6 @@ def print_diagnostics(self):
                     assert stats_type == "eigs"
                     continue
 
-
                 def get_count(count):
                     return 1 if stats_type in ["max", "min"] else count
 
@@ -250,22 +247,20 @@ def get_count(count):
                         eigs, _ = torch.symeig(stats)
                         stats = eigs.abs().sqrt()
                     except:  # noqa
-                        print(
-                            "Error getting eigenvalues, trying another method."
-                        )
+                        print("Error getting eigenvalues, trying another method.")
                         eigs, _ = torch.eig(stats)
                         stats = eigs.norm(dim=1).sqrt()
                         # sqrt so it reflects data magnitude, like stddev- not variance
 
-                if stats_type in [ "rms", "stddev" ]:
+                if stats_type in ["rms", "stddev"]:
                     # we stored the square; after aggregation we need to take sqrt.
                     stats = stats.sqrt()
 
                 # if `summarize` we print percentiles of the stats; else,
                 # we print out individual elements.
-                summarize = (
-                    len(stats_list) > 1
-                ) or self.opts.dim_is_summarized(stats.numel())
+                summarize = (len(stats_list) > 1) or self.opts.dim_is_summarized(
+                    stats.numel()
+                )
                 if summarize:  # usually `summarize` will be true
                     # print out percentiles.
                     stats = stats.sort()[0]
@@ -282,15 +277,15 @@ def get_count(count):
                     ans = stats.tolist()
                     ans = ["%.2g" % x for x in ans]
                     ans = "[" + " ".join(ans) + "]"
-                if stats_type in [ "value", "rms", "stddev", "eigs" ]:
+                if stats_type in ["value", "rms", "stddev", "eigs"]:
                     # This norm is useful because it is strictly less than the largest
                     # sqrt(eigenvalue) of the variance, which we print out, and shows,
                     # speaking in an approximate way, how much of that largest eigenvalue
                     # can be attributed to the mean of the distribution.
-                    norm = (stats ** 2).sum().sqrt().item()
+                    norm = (stats**2).sum().sqrt().item()
                     ans += f", norm={norm:.2g}"
                 mean = stats.mean().item()
-                rms = (stats ** 2).mean().sqrt().item()
+                rms = (stats**2).mean().sqrt().item()
                 ans += f", mean={mean:.3g}, rms={rms:.3g}"
 
                 # OK, "ans" contains the actual stats, e.g.
@@ -298,11 +293,11 @@ def get_count(count):
 
                 sizes = [x.tensor.shape[0] for x in stats_list]
                 size_str = (
-                    f"{sizes[0]}"
-                    if len(sizes) == 1
-                    else f"{min(sizes)}..{max(sizes)}"
+                    f"{sizes[0]}" if len(sizes) == 1 else f"{min(sizes)}..{max(sizes)}"
+                )
+                maybe_class_name = (
+                    f" type={self.class_name}," if self.class_name is not None else ""
                 )
-                maybe_class_name = f" type={self.class_name}," if self.class_name is not None else ""
                 print(
                     f"module={self.name},{maybe_class_name} dim={dim}, size={size_str}, {stats_type} {ans}"
                 )
@@ -330,7 +325,6 @@ def __init__(self, opts: TensorDiagnosticOptions, name: str):
         self.sum_gradsq = None
         self.sum_abs_grad = None
 
-
     def accumulate_input(self, x: Tensor, class_name: Optional[str] = None):
         """
         Called in forward pass.
@@ -347,8 +341,10 @@ def accumulate_input(self, x: Tensor, class_name: Optional[str] = None):
 
         limit = 10
         if len(self.saved_inputs) > limit:
-            print(f"ERROR: forward pass called for this module over {limit} times with no backward pass. "
-                  f" Will not accumulate scalar stats.")
+            print(
+                f"ERROR: forward pass called for this module over {limit} times with no backward pass. "
+                f" Will not accumulate scalar stats."
+            )
             self.is_ok = False
             return
         self.saved_inputs.append(x)
@@ -359,11 +355,15 @@ def accumulate_output_grad(self, grad: Tensor):
         if self.is_forward_pass:
             self.is_forward_pass = False
 
-        last_shape = 'n/a' if len(self.saved_inputs) == 0 else self.saved_inputs[-1].shape
+        last_shape = (
+            "n/a" if len(self.saved_inputs) == 0 else self.saved_inputs[-1].shape
+        )
         if len(self.saved_inputs) == 0 or grad.shape != last_shape:
-            print(f"ERROR: shape mismatch or no forward activation present when backward "
-                  f"pass called: grad shape ={tuple(grad.shape)}, num-saved-inputs={len(self.saved_inputs)}"
-                  f", shape-of-last-saved-input={last_shape}")
+            print(
+                f"ERROR: shape mismatch or no forward activation present when backward "
+                f"pass called: grad shape ={tuple(grad.shape)}, num-saved-inputs={len(self.saved_inputs)}"
+                f", shape-of-last-saved-input={last_shape}"
+            )
             self.is_ok = False
             return
 
@@ -384,11 +384,19 @@ def process_input_and_grad(self, x: Tensor, grad: Tensor):
             self.tick_scale = float(x_abs_sorted[index] / num_ticks_per_side)
 
             # integerize from tick * (-num ticks_per_side ..  num_ticks_per_side - 1]
-            self.counts = torch.zeros(2 * num_ticks_per_side, dtype=torch.long, device=x.device)
-            self.sum_grad = torch.zeros(2 * num_ticks_per_side, dtype=torch.double, device=x.device)
+            self.counts = torch.zeros(
+                2 * num_ticks_per_side, dtype=torch.long, device=x.device
+            )
+            self.sum_grad = torch.zeros(
+                2 * num_ticks_per_side, dtype=torch.double, device=x.device
+            )
             # sum_gradsq is for getting error bars.
-            self.sum_gradsq = torch.zeros(2 * num_ticks_per_side, dtype=torch.double, device=x.device)
-            self.sum_abs_grad = torch.zeros(2 * num_ticks_per_side, dtype=torch.double, device=x.device)
+            self.sum_gradsq = torch.zeros(
+                2 * num_ticks_per_side, dtype=torch.double, device=x.device
+            )
+            self.sum_abs_grad = torch.zeros(
+                2 * num_ticks_per_side, dtype=torch.double, device=x.device
+            )
 
         # this will round down.
         x = (x / self.tick_scale).to(torch.long)
@@ -397,20 +405,21 @@ def process_input_and_grad(self, x: Tensor, grad: Tensor):
 
         self.counts.index_add_(dim=0, index=x, source=torch.ones_like(x))
         self.sum_grad.index_add_(dim=0, index=x, source=grad.to(torch.double))
-        self.sum_gradsq.index_add_(dim=0, index=x, source=(grad*grad).to(torch.double))
+        self.sum_gradsq.index_add_(
+            dim=0, index=x, source=(grad * grad).to(torch.double)
+        )
         self.sum_abs_grad.index_add_(dim=0, index=x, source=grad.abs().to(torch.double))
 
-
     def print_diagnostics(self):
         """Print diagnostics."""
         if self.is_ok is False or self.counts is None:
             print(f"Warning: no stats accumulated for {self.name}, is_ok={self.is_ok}")
             return
 
-        counts = self.counts.to('cpu')
-        sum_grad = self.sum_grad.to(device='cpu', dtype=torch.float32)
-        sum_gradsq = self.sum_gradsq.to(device='cpu', dtype=torch.float32)
-        sum_abs_grad = self.sum_abs_grad.to(device='cpu', dtype=torch.float32)
+        counts = self.counts.to("cpu")
+        sum_grad = self.sum_grad.to(device="cpu", dtype=torch.float32)
+        sum_gradsq = self.sum_gradsq.to(device="cpu", dtype=torch.float32)
+        sum_abs_grad = self.sum_abs_grad.to(device="cpu", dtype=torch.float32)
 
         counts_cumsum = counts.cumsum(dim=0)
         counts_tot = counts_cumsum[-1]
@@ -433,19 +442,22 @@ def print_diagnostics(self):
         bin_abs_grad = torch.zeros(num_bins)
         bin_abs_grad.index_add_(dim=0, index=bin_indexes, source=sum_abs_grad)
 
-        avg_grad = (bin_grad / bin_counts)
+        avg_grad = bin_grad / bin_counts
         avg_grad_stddev = (bin_gradsq / bin_counts).sqrt()
 
-        bin_boundary_counts = torch.arange(num_bins + 1, dtype=torch.long) * counts_per_bin
+        bin_boundary_counts = (
+            torch.arange(num_bins + 1, dtype=torch.long) * counts_per_bin
+        )
         bin_tick_indexes = torch.searchsorted(counts_cumsum, bin_boundary_counts)
         # boundaries are the "x" values between the bins, e.g. corresponding to the
         # locations of percentiles of the distribution.
         num_ticks_per_side = counts.numel() // 2
         bin_boundaries = (bin_tick_indexes - num_ticks_per_side) * self.tick_scale
 
-
         bin_grad = bin_grad / (bin_counts + 1)
-        bin_conf_interval = bin_gradsq.sqrt() / (bin_counts + 1) # consider this a standard deviation.
+        bin_conf_interval = bin_gradsq.sqrt() / (
+            bin_counts + 1
+        )  # consider this a standard deviation.
         # bin_grad / bin_abs_grad will give us a sense for how important in a practical sense,
         # the gradients are.
         bin_abs_grad = bin_abs_grad / (bin_counts + 1)
@@ -458,8 +470,9 @@ def tensor_to_str(x: Tensor):
             x = "[" + " ".join(x) + "]"
             return x
 
-
-        maybe_class_name = f" type={self.class_name}," if self.class_name is not None else ""
+        maybe_class_name = (
+            f" type={self.class_name}," if self.class_name is not None else ""
+        )
 
         print(
             f"module={self.name},{maybe_class_name} bin-boundaries={tensor_to_str(bin_boundaries)}, "
@@ -467,7 +480,6 @@ def tensor_to_str(x: Tensor):
         )
 
 
-
 class ModelDiagnostic(object):
     """This class stores diagnostics for all tensors in the torch.nn.Module.
 
@@ -485,9 +497,8 @@ def __init__(self, opts: Optional[TensorDiagnosticOptions] = None):
             self.opts = opts
         self.diagnostics = dict()
 
-
     def __getitem__(self, name: str):
-        T = ScalarDiagnostic if name[-7:] == '.scalar' else TensorDiagnostic
+        T = ScalarDiagnostic if name[-7:] == ".scalar" else TensorDiagnostic
         if name not in self.diagnostics:
             self.diagnostics[name] = T(self.opts, name)
         return self.diagnostics[name]
@@ -502,18 +513,19 @@ def get_class_name(module: nn.Module):
     ans = type(module).__name__
     # we put the below in try blocks in case anyone is using a different version of these modules that
     # might have different member names.
-    if ans == 'Balancer' or ans == 'ActivationBalancer':
+    if ans == "Balancer" or ans == "ActivationBalancer":
         try:
-            ans += f'[{float(module.min_positive)},{float(module.max_positive)},{float(module.min_abs)},{float(module.max_abs)}]'
+            ans += f"[{float(module.min_positive)},{float(module.max_positive)},{float(module.min_abs)},{float(module.max_abs)}]"
         except:
             pass
-    elif ans == 'AbsValuePenalizer':
+    elif ans == "AbsValuePenalizer":
         try:
-            ans += f'[{module.limit}]'
+            ans += f"[{module.limit}]"
         except:
             pass
     return ans
 
+
 def attach_diagnostics(
     model: nn.Module, opts: Optional[TensorDiagnosticOptions] = None
 ) -> ModelDiagnostic:
@@ -538,73 +550,85 @@ def attach_diagnostics(
         if name == "":
             name = "<top-level>"
 
-
-
         # Setting model_diagnostic=ans and n=name below, instead of trying to
         # capture the variables, ensures that we use the current values.
         # (this matters for `name`, since the variable gets overwritten).
         # These closures don't really capture by value, only by
         # "the final value the variable got in the function" :-(
-        def forward_hook(
-            _module, _input, _output, _model_diagnostic=ans, _name=name
-        ):
+        def forward_hook(_module, _input, _output, _model_diagnostic=ans, _name=name):
             if isinstance(_output, tuple) and len(_output) == 1:
                 _output = _output[0]
 
-            if isinstance(_output, Tensor) and _output.dtype in ( torch.float32, torch.float16, torch.float64 ):
-                _model_diagnostic[f"{_name}.output"].accumulate(_output,
-                                                                class_name=get_class_name(_module))
+            if isinstance(_output, Tensor) and _output.dtype in (
+                torch.float32,
+                torch.float16,
+                torch.float64,
+            ):
+                _model_diagnostic[f"{_name}.output"].accumulate(
+                    _output, class_name=get_class_name(_module)
+                )
             elif isinstance(_output, tuple):
                 for i, o in enumerate(_output):
-                    if o.dtype in ( torch.float32, torch.float16, torch.float64 ):
-                        _model_diagnostic[f"{_name}.output[{i}]"].accumulate(o,
-                                                                             class_name=get_class_name(_module))
+                    if o.dtype in (torch.float32, torch.float16, torch.float64):
+                        _model_diagnostic[f"{_name}.output[{i}]"].accumulate(
+                            o, class_name=get_class_name(_module)
+                        )
 
-        def backward_hook(
-            _module, _input, _output, _model_diagnostic=ans, _name=name
-        ):
+        def backward_hook(_module, _input, _output, _model_diagnostic=ans, _name=name):
             if isinstance(_output, tuple) and len(_output) == 1:
                 _output = _output[0]
-            if isinstance(_output, Tensor) and _output.dtype in ( torch.float32, torch.float16, torch.float64 ):
-                _model_diagnostic[f"{_name}.grad"].accumulate(_output,
-                                                              class_name=get_class_name(_module))
+            if isinstance(_output, Tensor) and _output.dtype in (
+                torch.float32,
+                torch.float16,
+                torch.float64,
+            ):
+                _model_diagnostic[f"{_name}.grad"].accumulate(
+                    _output, class_name=get_class_name(_module)
+                )
             elif isinstance(_output, tuple):
                 for i, o in enumerate(_output):
-                    if o.dtype in ( torch.float32, torch.float16, torch.float64 ):
-                        _model_diagnostic[f"{_name}.grad[{i}]"].accumulate(o,
-                                                                           class_name=get_class_name(_module))
-
+                    if o.dtype in (torch.float32, torch.float16, torch.float64):
+                        _model_diagnostic[f"{_name}.grad[{i}]"].accumulate(
+                            o, class_name=get_class_name(_module)
+                        )
 
         module.register_forward_hook(forward_hook)
         module.register_backward_hook(backward_hook)
 
-        if type(module).__name__ in  ["Sigmoid", "Tanh", "ReLU", "TanSwish", "Swish", "DoubleSwish", "Swoosh"]:
+        if type(module).__name__ in [
+            "Sigmoid",
+            "Tanh",
+            "ReLU",
+            "TanSwish",
+            "Swish",
+            "DoubleSwish",
+            "Swoosh",
+        ]:
             # For these specific module types, accumulate some additional diagnostics
             # that can help us improve the activation function.  These require a lot of memory,
             # to save the forward activations, so limit this to some select classes.
             # Note: this will not work correctly for all model types.
             def scalar_forward_hook(
-                    _module, _input, _output, _model_diagnostic=ans, _name=name
+                _module, _input, _output, _model_diagnostic=ans, _name=name
             ):
                 if isinstance(_input, tuple):
-                    _input, = _input
+                    (_input,) = _input
                 assert isinstance(_input, Tensor)
-                _model_diagnostic[f"{_name}.scalar"].accumulate_input(_input,
-                                                                      class_name=get_class_name(_module))
+                _model_diagnostic[f"{_name}.scalar"].accumulate_input(
+                    _input, class_name=get_class_name(_module)
+                )
 
             def scalar_backward_hook(
-                    _module, _input, _output, _model_diagnostic=ans, _name=name
+                _module, _input, _output, _model_diagnostic=ans, _name=name
             ):
                 if isinstance(_output, tuple):
-                    _output, = _output
+                    (_output,) = _output
                 assert isinstance(_output, Tensor)
                 _model_diagnostic[f"{_name}.scalar"].accumulate_output_grad(_output)
 
             module.register_forward_hook(scalar_forward_hook)
             module.register_backward_hook(scalar_backward_hook)
 
-
-
     for name, parameter in model.named_parameters():
 
         def param_backward_hook(
diff --git a/icefall/profiler.py b/icefall/profiler.py
index dc76ebebc5..49e1385799 100644
--- a/icefall/profiler.py
+++ b/icefall/profiler.py
@@ -70,25 +70,17 @@ def pre_hook(module, input):
                 module_flop_count.append([])
 
             if not hasattr(module, "__pre_hook_handle__"):
-                module.__pre_hook_handle__ = module.register_forward_pre_hook(
-                    pre_hook
-                )
+                module.__pre_hook_handle__ = module.register_forward_pre_hook(pre_hook)
 
             def post_hook(module, input, output):
                 if module_flop_count:
-                    module.__flops__ += sum(
-                        [elem[1] for elem in module_flop_count[-1]]
-                    )
+                    module.__flops__ += sum([elem[1] for elem in module_flop_count[-1]])
                     module_flop_count.pop()
 
             if not hasattr(module, "__post_hook_handle__"):
-                module.__post_hook_handle__ = module.register_forward_hook(
-                    post_hook
-                )
+                module.__post_hook_handle__ = module.register_forward_hook(post_hook)
 
-        self.model.apply(
-            partial(register_module_hooks, ignore_list=ignore_list)
-        )
+        self.model.apply(partial(register_module_hooks, ignore_list=ignore_list))
         self.started = True
         self.func_patched = True
 
@@ -194,9 +186,7 @@ def _prelu_flops_compute(input: Tensor, weight: Tensor):
     return input.numel()
 
 
-def _elu_flops_compute(
-    input: Tensor, alpha: float = 1.0, inplace: bool = False
-):
+def _elu_flops_compute(input: Tensor, alpha: float = 1.0, inplace: bool = False):
     return input.numel()
 
 
@@ -259,9 +249,7 @@ def _conv_flops_compute(
         output_dims.append(output_dim)
 
     filters_per_channel = out_channels // groups
-    conv_per_position_macs = (
-        int(_prod(kernel_dims)) * in_channels * filters_per_channel
-    )
+    conv_per_position_macs = int(_prod(kernel_dims)) * in_channels * filters_per_channel
     active_elements_count = batch_size * int(_prod(output_dims))
     overall_conv_macs = conv_per_position_macs * active_elements_count
     overall_conv_flops = 2 * overall_conv_macs
@@ -297,7 +285,6 @@ def _conv_trans_flops_compute(
 
     output_dims = []
     for idx, input_dim in enumerate(input_dims):
-
         output_dim = (
             input_dim
             + 2 * paddings[idx]
@@ -310,9 +297,7 @@ def _conv_trans_flops_compute(
     dilations = dilation if type(dilation) is tuple else (dilation, dilation)
 
     filters_per_channel = out_channels // groups
-    conv_per_position_macs = (
-        int(_prod(kernel_dims)) * in_channels * filters_per_channel
-    )
+    conv_per_position_macs = int(_prod(kernel_dims)) * in_channels * filters_per_channel
     active_elements_count = batch_size * int(_prod(input_dims))
     overall_conv_macs = conv_per_position_macs * active_elements_count
     overall_conv_flops = 2 * overall_conv_macs
@@ -389,9 +374,7 @@ def _upsample_flops_compute(input, **kwargs):
         else:
             return int(size), 0
     scale_factor = kwargs.get("scale_factor", None)
-    assert (
-        scale_factor is not None
-    ), "either size or scale_factor should be defined"
+    assert scale_factor is not None, "either size or scale_factor should be defined"
     flops = input.numel()
     if isinstance(scale_factor, tuple) and len(scale_factor) == len(input):
         flops * int(_prod(scale_factor))
@@ -593,12 +576,8 @@ def _patch_functionals():
     F.embedding = wrapFunc(F.embedding, _embedding_flops_compute)
 
     # swoosh functions in k2
-    k2.swoosh_l_forward = wrapFunc(
-        k2.swoosh_l_forward, _k2_swoosh_flops_compute
-    )
-    k2.swoosh_r_forward = wrapFunc(
-        k2.swoosh_r_forward, _k2_swoosh_flops_compute
-    )
+    k2.swoosh_l_forward = wrapFunc(k2.swoosh_l_forward, _k2_swoosh_flops_compute)
+    k2.swoosh_r_forward = wrapFunc(k2.swoosh_r_forward, _k2_swoosh_flops_compute)
     k2.swoosh_l = wrapFunc(k2.swoosh_l, _k2_swoosh_flops_compute)
     k2.swoosh_r = wrapFunc(k2.swoosh_r, _k2_swoosh_flops_compute)
 
@@ -612,9 +591,7 @@ def _patch_tensor_methods():
     torch.Tensor.bmm = wrapFunc(torch.Tensor.bmm, _matmul_flops_compute)
 
     torch.addmm = wrapFunc(torch.addmm, _addmm_flops_compute)
-    torch.Tensor.addmm = wrapFunc(
-        torch.Tensor.addmm, _tensor_addmm_flops_compute
-    )
+    torch.Tensor.addmm = wrapFunc(torch.Tensor.addmm, _tensor_addmm_flops_compute)
 
     torch.mul = wrapFunc(torch.mul, _mul_flops_compute)
     torch.Tensor.mul = wrapFunc(torch.Tensor.mul, _mul_flops_compute)
@@ -631,14 +608,10 @@ def _patch_tensor_methods():
 
     torch.tanh = wrapFunc(torch.tanh, _tanh_flops_compute)
 
-    torch.Tensor.softmax = wrapFunc(
-        torch.Tensor.softmax, _softmax_flops_compute
-    )
+    torch.Tensor.softmax = wrapFunc(torch.Tensor.softmax, _softmax_flops_compute)
 
     torch.sigmoid = wrapFunc(torch.sigmoid, _sigmoid_flops_compute)
-    torch.Tensor.sigmoid = wrapFunc(
-        torch.Tensor.sigmoid, _sigmoid_flops_compute
-    )
+    torch.Tensor.sigmoid = wrapFunc(torch.Tensor.sigmoid, _sigmoid_flops_compute)
 
 
 def _reload_functionals():
@@ -732,15 +705,11 @@ def _rnn_flops(flops, rnn_module, w_ih, w_hh, input_size):
         flops += rnn_module.hidden_size * 4
         # two hadamard _product and add for C state
         flops += (
-            rnn_module.hidden_size
-            + rnn_module.hidden_size
-            + rnn_module.hidden_size
+            rnn_module.hidden_size + rnn_module.hidden_size + rnn_module.hidden_size
         )
         # final hadamard
         flops += (
-            rnn_module.hidden_size
-            + rnn_module.hidden_size
-            + rnn_module.hidden_size
+            rnn_module.hidden_size + rnn_module.hidden_size + rnn_module.hidden_size
         )
     return flops
 
diff --git a/icefall/rnn_lm/check-onnx-streaming.py b/icefall/rnn_lm/check-onnx-streaming.py
index d51a4b76b0..28b908f828 100755
--- a/icefall/rnn_lm/check-onnx-streaming.py
+++ b/icefall/rnn_lm/check-onnx-streaming.py
@@ -112,7 +112,6 @@ def main():
         for torch_v, onnx_v in zip(
             (torch_log_prob, torch_h0, torch_c0), (onnx_log_prob, onnx_h0, onnx_c0)
         ):
-
             assert torch.allclose(torch_v, onnx_v, atol=1e-5), (
                 torch_v.shape,
                 onnx_v.shape,
diff --git a/icefall/rnn_lm/train.py b/icefall/rnn_lm/train.py
index 3d206d1397..0178b80bfc 100755
--- a/icefall/rnn_lm/train.py
+++ b/icefall/rnn_lm/train.py
@@ -463,7 +463,6 @@ def train_one_epoch(
     cur_batch_idx = params.get("cur_batch_idx", 0)
 
     for batch_idx, batch in enumerate(train_dl):
-
         if batch_idx < cur_batch_idx:
             continue
         cur_batch_idx = batch_idx
diff --git a/icefall/shared/make_kn_lm.py b/icefall/shared/make_kn_lm.py
index 7150297d6c..231aca7f19 100755
--- a/icefall/shared/make_kn_lm.py
+++ b/icefall/shared/make_kn_lm.py
@@ -225,7 +225,6 @@ def cal_f(self):
         for n in range(0, self.ngram_order - 1):
             this_order_counts = self.counts[n]
             for hist, counts_for_hist in this_order_counts.items():
-
                 n_star_star = 0
                 for w in counts_for_hist.word_to_count.keys():
                     n_star_star += len(counts_for_hist.word_to_context[w])
@@ -424,7 +423,6 @@ def print_as_arpa(
 
 
 if __name__ == "__main__":
-
     ngram_counts = NgramCounts(args.ngram_order)
 
     if args.text is None:
diff --git a/icefall/transformer_lm/model.py b/icefall/transformer_lm/model.py
index 79dda31682..c78cf1821d 100644
--- a/icefall/transformer_lm/model.py
+++ b/icefall/transformer_lm/model.py
@@ -103,7 +103,6 @@ def forward(
         return nll_loss
 
     def score_token(self, x: torch.Tensor, x_lens: torch.Tensor, state=None):
-
         bs = x.size(0)
 
         state = None
diff --git a/requirements-ci.txt b/requirements-ci.txt
index 2433e190bc..652e2ab471 100644
--- a/requirements-ci.txt
+++ b/requirements-ci.txt
@@ -20,6 +20,7 @@ kaldialign==0.7.1
 sentencepiece==0.1.96
 tensorboard==2.8.0
 typeguard==2.13.3
+black==22.3.0
 multi_quantization
 
 onnx
diff --git a/requirements.txt b/requirements.txt
index a07f6b7c7a..f0098c2364 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ sentencepiece>=0.1.96
 tensorboard
 typeguard
 dill
+black==22.3.0