From f0083e7eb20d032390e42f6f6039947fa8669c93 Mon Sep 17 00:00:00 2001 From: Umar Farooqi Date: Tue, 17 Jan 2023 17:43:05 -0500 Subject: [PATCH] Use ndimage.median_filter instead of signal.medfilt (#812) For a 30s long audio file which didn't have any silence, ndimage.median_filter took 7s where signal.medfilt took 30s. Co-authored-by: Umar Farooqi Co-authored-by: Jong Wook Kim --- notebooks/Multilingual_ASR.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/Multilingual_ASR.ipynb b/notebooks/Multilingual_ASR.ipynb index 4ab7ab40a..2d32e0e02 100644 --- a/notebooks/Multilingual_ASR.ipynb +++ b/notebooks/Multilingual_ASR.ipynb @@ -874,7 +874,7 @@ "from IPython.display import display, HTML\n", "from whisper.tokenizer import get_tokenizer\n", "from dtw import dtw\n", - "from scipy.signal import medfilt\n", + "from scipy.ndimage import median_filter\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = \"retina\"" @@ -3610,7 +3610,7 @@ "\n", " weights = torch.cat(QKs) # layers * heads * tokens * frames \n", " weights = weights[:, :, :, : duration // AUDIO_SAMPLES_PER_TOKEN].cpu()\n", - " weights = medfilt(weights, (1, 1, 1, medfilt_width))\n", + " weights = median_filter(weights, (1, 1, 1, medfilt_width))\n", " weights = torch.tensor(weights * qk_scale).softmax(dim=-1)\n", " \n", " w = weights / weights.norm(dim=-2, keepdim=True)\n",