in chunking/chunkers/transcription_chunker.py [0:0]
def _vtt_process(self):
    """Flatten the WebVTT document in ``self.document_bytes`` into plain text.

    Consecutive captions that share the same voice tag are merged into a
    single line. A line is prefixed with ``"<voice>: "`` when its captions
    carry a voice tag and has no prefix otherwise. Newlines inside a
    caption are replaced by spaces so that every speaker turn occupies
    exactly one output line.

    Returns:
        str: The speaker turns joined by ``"\\n"``, with leading and
        trailing whitespace stripped from the overall result.
    """
    # NOTE(review): the buffer handed to webvtt is a binary stream; confirm
    # the installed webvtt version accepts bytes rather than text here.
    captions = webvtt.read_buffer(BytesIO(self.document_bytes))

    turns = []
    active_voice = ""
    turn_text = ""
    for caption in captions:
        caption_voice = caption.voice or ""
        if caption_voice == active_voice:
            # Same speaker keeps talking: extend the current turn.
            turn_text += caption.text + " "
            continue

        # Speaker changed: close the previous turn, if any.
        if turn_text:
            turns.append(turn_text.replace("\n", " "))

        # Label the new turn with the speaker of *this* caption. The label
        # must not come from the previous speaker, otherwise the first turn
        # is unlabeled and every later turn is attributed to the wrong voice.
        label = f"{caption_voice}: " if caption_voice else ""
        active_voice = caption_voice
        turn_text = f"{label}{caption.text} "

    # Flush the turn that was still open when the captions ran out.
    if turn_text:
        turns.append(turn_text.replace("\n", " "))

    return "\n".join(turns).strip()