From 61a94594d009d8c72b5eda125632f1d3f8b71ec0 Mon Sep 17 00:00:00 2001
From: Lukas Prediger
Date: Sat, 22 Jul 2023 17:31:50 +0300
Subject: [PATCH] Significant runtime improvements to detect_silence

Use numpy to compute the RMS for silence detection. This removes the
redundant computation of the previous detect_silence implementation
(and benefits from numpy's highly optimized routines).

Some caveats:
- adds numpy as a new dependency
- previously, RMS values were rounded down to the next integer; this is
  no longer the case, so the borders of detected silence ranges may
  vary slightly compared to the previous implementation

---
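Notes (not part of the commit message): the public interface is unchanged
except for the new optional max_buffer_size_kb parameter. A short usage
sketch, where the file name is only a placeholder:

    from pydub import AudioSegment
    from pydub.silence import detect_silence

    seg = AudioSegment.from_file("example.wav")

    # defaults: internal buffers are capped at roughly 100 MiB
    silent_ranges = detect_silence(seg, min_silence_len=1000, silence_thresh=-16)

    # cap the internal buffers at roughly 10 MiB instead
    silent_ranges = detect_silence(seg, max_buffer_size_kb=10 * 1024)

    # a negative value disables the cap; the whole segment is buffered at once
    silent_ranges = detect_silence(seg, max_buffer_size_kb=-1)
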
 pydub/silence.py | 135 +++++++++++++++++++++++++++++++++++++++--------
 setup.py         |   3 ++
 2 files changed, 115 insertions(+), 23 deletions(-)

diff --git a/pydub/silence.py b/pydub/silence.py
index 1f18683e..a3b4ad5d 100644
--- a/pydub/silence.py
+++ b/pydub/silence.py
@@ -2,73 +2,162 @@
 Various functions for finding/manipulating silence in AudioSegments
 """
 import itertools
+import numpy as np

 from .utils import db_to_float


-def detect_silence(audio_segment, min_silence_len=1000, silence_thresh=-16, seek_step=1):
+def _convert_to_numpy(audio_segment):
+    """
+    Returns a numpy array view of the raw samples of an AudioSegment,
+    with shape (number of frames, number of channels).
+
+    Does not allocate any additional memory.
+
+    audio_segment - the segment to convert into a numpy array
+    """
+    dtype = {
+        1: np.int8,
+        2: np.int16,
+        4: np.int32
+    }[audio_segment.sample_width]
+    return np.frombuffer(audio_segment.raw_data, dtype=dtype).reshape(-1, audio_segment.channels)
+
+
+def detect_silence(audio_segment, min_silence_len=1000, silence_thresh=-16, seek_step=1, max_buffer_size_kb=100*1024):
     """
     Returns a list of all silent sections [start, end] in milliseconds of audio_segment.
-    Inverse of detect_nonsilent()
+    Inverse of detect_nonsilent().

     audio_segment - the segment to find silence in
     min_silence_len - the minimum length for any silent section
     silence_thresh - the upper bound for how quiet is silent in dFBS
     seek_step - step size for interating over the segment in ms
+    max_buffer_size_kb - the maximum size of internally allocated buffers in KiB
     """
-    seg_len = len(audio_segment)
-
+    raw_data = _convert_to_numpy(audio_segment)
+    min_silence_ms = min_silence_len
+    silence_threshold_db = silence_thresh
+    seek_step_ms = seek_step
+
+    seg_len_ms = len(audio_segment)
+    frames_per_ms = audio_segment.frame_rate / 1000
+
+    assert raw_data.shape[0] == audio_segment.frame_count()
+    assert raw_data.shape[1] == audio_segment.channels
+
+    max_frames_in_slice = int(np.ceil(min_silence_ms * frames_per_ms))
+
+    # determine the number of frames in the computation window buffer
+    if max_buffer_size_kb >= 0:
+        bytes_per_frame = 8
+        frames_per_kb = 1024 // bytes_per_frame
+        buffer_len = max_buffer_size_kb * frames_per_kb
+        # empirical testing shows that we need approximately 4 times as much memory
+        # as buffer_len would suggest, probably because numpy allocates additional
+        # buffers in the background during computations; we correct for this by
+        # adjusting buffer_len accordingly
+        correction_constant = 4
+        buffer_len //= correction_constant
+        if buffer_len < max_frames_in_slice:
+            min_buffer_size_kb = int(np.ceil(max_frames_in_slice * bytes_per_frame / 1024)) * correction_constant
+            raise ValueError(
+                "Buffer is too small: max_buffer_size_kb must be at least {} "
+                "for min_silence_len={}".format(min_buffer_size_kb, min_silence_ms)
+            )
+    else:
+        buffer_len = len(raw_data)  # no restrictions!
+
+
     # you can't have a silent portion of a sound that is longer than the sound
-    if seg_len < min_silence_len:
+    if seg_len_ms < min_silence_ms:
         return []
-
+
     # convert silence threshold to a float value (so we can compare it to rms)
-    silence_thresh = db_to_float(silence_thresh) * audio_segment.max_possible_amplitude
+    normalization_const = float(audio_segment.max_possible_amplitude)
+    silence_thresh = db_to_float(silence_threshold_db) * audio_segment.max_possible_amplitude / normalization_const

     # check successive (1 sec by default) chunk of sound for silence
     # try a chunk at every "seek step" (or every chunk for a seek step == 1)
-    last_slice_start = seg_len - min_silence_len
-    slice_starts = range(0, last_slice_start + 1, seek_step)
+    last_slice_start = seg_len_ms - min_silence_ms
+    slice_starts = range(0, last_slice_start + 1, seek_step_ms)

     # guarantee last_slice_start is included in the range
     # to make sure the last portion of the audio is searched
-    if last_slice_start % seek_step:
+    if last_slice_start % seek_step_ms:
         slice_starts = itertools.chain(slice_starts, [last_slice_start])

     # list of all continuous regions of silence (start ms - end ms)
     silent_ranges = []

-    prev_silent_i = None
+    prev_silent_ms = None
     current_range_start = None

-    # loop over audio to detect slices of silence
-    for i in slice_starts:
-        audio_slice = audio_segment[i:i + min_silence_len]
-        if audio_slice.rms <= silence_thresh:
+    # load the first window into the buffer
+    # the cumsq_per_frame buffer holds the cumulative sum of the means of squares
+    # over channels for each frame, normalized by max_possible_amplitude (i.e., all
+    # values x satisfy 0 <= x <= 1) - this prevents the cumulative sums of squares
+    # from exceeding representable values
+    cumsq_per_frame = np.concatenate((
+        [0.],
+        np.cumsum(
+            np.mean((raw_data[0:buffer_len].astype(np.float64))**2, axis=-1)
+            / (normalization_const**2)
+        )
+    ))
+    # keep track of the frames currently in the buffer
+    buffer_offset = 0
+    buffer_end = buffer_len
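+
+    # note: given the prefix sums S = cumsq_per_frame, the mean of the squares of
+    # the frames [a, b) of the buffer is (S[b] - S[a]) / (b - a), so the RMS of
+    # each slice below is computed in constant time, independent of min_silence_len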
+
+    for slice_start_ms in slice_starts:
+        slice_start = int(slice_start_ms * frames_per_ms)
+        slice_end = min(int((slice_start_ms + min_silence_ms) * frames_per_ms), len(raw_data))
+        assert slice_end <= len(raw_data)
+        # if the frame rate does not divide evenly into milliseconds, slices may
+        # contain slightly varying numbers of frames, so we compute the actual
+        # length of the concrete slice here
+        frames_in_slice = slice_end - slice_start
+
+        if slice_end > buffer_end:  # we ran out of buffer; load the next window
+            cumsq_per_frame = np.concatenate((
+                [0.],
+                np.cumsum(
+                    np.mean(
+                        (raw_data[slice_start:slice_start + buffer_len].astype(np.float64))**2,
+                        axis=-1
+                    ) / (normalization_const**2)
+                )
+            ))
+            buffer_offset = slice_start
+            buffer_end = buffer_offset + buffer_len
+
+        # compute the RMS of the current slice from the cumulative sums of squares in the buffer
+        slice_msq = cumsq_per_frame[slice_end - buffer_offset] - cumsq_per_frame[slice_start - buffer_offset]
+        rms = np.sqrt(slice_msq / frames_in_slice)
+
+        if rms <= silence_thresh:
+            # current slice is silent; combine it with the preceding silent slice
+            # if there is no nonsilent gap between them
             if current_range_start is None:
-                current_range_start = i
+                current_range_start = slice_start_ms
             else:
-                continuous = (i == prev_silent_i + seek_step)
+                continuous = (slice_start_ms == prev_silent_ms + seek_step_ms)

                 # sometimes two small blips are enough for one particular slice to be
                 # non-silent, despite the silence all running together. Just combine
                 # the two overlapping silent ranges.
-                silence_has_gap = i > (prev_silent_i + min_silence_len)
+                silence_has_gap = slice_start_ms > (prev_silent_ms + min_silence_ms)

                 if not continuous and silence_has_gap:
                     silent_ranges.append([
                         current_range_start,
-                        prev_silent_i + min_silence_len
+                        prev_silent_ms + min_silence_ms
                     ])
-                    current_range_start = i
+                    current_range_start = slice_start_ms

-            prev_silent_i = i
+            prev_silent_ms = slice_start_ms

     if current_range_start is not None:
-        assert prev_silent_i is not None
+        assert prev_silent_ms is not None
         silent_ranges.append([current_range_start,
-                              prev_silent_i + min_silence_len])
+                              prev_silent_ms + min_silence_ms])

     return silent_ranges

diff --git a/setup.py b/setup.py
index 7a2fc152..dba36922 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,9 @@
     keywords='audio sound high-level',
     url='http://pydub.com',
     packages=['pydub'],
+    install_requires=[
+        "numpy >= 1.16, < 2.0",
+    ],
     long_description=__doc__,
     classifiers=[
         'Development Status :: 5 - Production/Stable',
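
Reviewer note (not part of the commit): the intended numerical difference to
the old implementation is only the dropped integer truncation mentioned in
the commit message. A minimal sketch to compare both values, assuming seg is
any AudioSegment:

    import numpy as np
    from pydub.silence import _convert_to_numpy

    samples = _convert_to_numpy(seg).astype(np.float64)
    rms_new = np.sqrt(np.mean(samples ** 2))
    # seg.rms (computed via audioop) rounds down to an integer;
    # the numpy value does not
    print(seg.rms, rms_new)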