in data_preparation/split_librilight/prepare_vads.py [0:0]
def split_vad(silence_probs: List[float], p_silence_threshold: float, len_threshold: int) -> List[Tuple[int, int]]:
    """Split a sequence of per-frame silence probabilities into speech intervals.

    A frame is silence when its probability is strictly above
    `p_silence_threshold`; every other frame (including one exactly at the
    threshold) is speech. The returned intervals are such that
    (a) they are separated by a run of silence that either spans at least
        `len_threshold` frames or reaches the end of the sequence, and
    (b) they are themselves strictly longer than `len_threshold` frames.
    Shorter silence runs are absorbed into the surrounding speech interval;
    shorter speech intervals are dropped.

    Arguments:
        silence_probs -- list of silence probabilities, one per frame
        p_silence_threshold -- all frames with silence probability above this
            threshold are considered as silence
        len_threshold -- minimal length of silence and non-silence segments

    Returns: list of tuples (start_speech_frame, end), where `end` is the index
        of the first silence frame after the interval, or len(silence_probs)
        when the interval runs to the end of the sequence.
    """
    # Classify every frame once so that all steps below share a single
    # definition of "silence" (strictly above the threshold).
    is_silence = [p > p_silence_threshold for p in silence_probs]
    n = len(is_silence)
    segments = []

    # Skip leading silence.
    i = 0
    while i < n and is_silence[i]:
        i += 1
    # Invariant: `start` is the frame where the current speech interval
    # begins, and i >= start.
    start = i

    while i < n:
        # Scroll through speech until the first silence frame.
        if not is_silence[i]:
            i += 1
            continue

        # i is a silence frame. Look ahead over the rest of a window of
        # `len_threshold` frames (clipped at the end of the sequence) for a
        # speech frame that would make this silence run too short to split on.
        window_end = min(i + len_threshold, n)
        first_speech = next(
            (j for j in range(i + 1, window_end) if not is_silence[j]), None
        )
        if first_speech is not None:
            # Not enough silence: ignore it and resume from the speech frame.
            i = first_speech
            continue

        # Enough silence for a split: keep the interval only if long enough.
        if i - start > len_threshold:
            segments.append((start, i))

        # Skip the remainder of the silence run; the next interval starts at
        # the first speech frame after it (or at n if there is none).
        while i < n and is_silence[i]:
            i += 1
        start = i
        # Frame `start` is already known not to be silence; step past it.
        i += 1

    # Flush the trailing interval. `start < n` rules out the case where the
    # sequence ended in silence and `i` was stepped past the end.
    if i - start > len_threshold and start < n:
        segments.append((start, i))

    return segments