in utils/generate_subtitles.py [0:0]
def generate_subtitles(language: str, youtube_language_code: str = None, is_task_playlist: bool = False):
    """Write translated SRT subtitle files for every video of a playlist.

    For each video returned by ``Playlist.getVideos`` the English transcript
    ("en" or "en-US") is looked up, translated into the requested language and
    saved as an ``.srt`` file under ``subtitles/<language>/``.  A CSV listing
    the id, title, link and SRT filename of every processed video is written
    to the same directory (``metadata_tasks.csv`` for the task playlist,
    ``metadata.csv`` otherwise).

    Args:
        language: Language code used for the output directory name and, when
            it appears in the transcript's translation languages, for the
            translation itself.
        youtube_language_code: Code passed to the translation instead of
            ``language`` when ``language`` is not among the translation
            languages reported for a video's English transcript.
        is_task_playlist: When true, read ``TASK_VIDEOS_PLAYLIST`` and prefix
            SRT filenames with ``tasks_``; otherwise read
            ``COURSE_VIDEOS_PLAYLIST``.

    Raises:
        ValueError: If ``language`` is not among a video's translation
            languages and ``youtube_language_code`` was not provided.

    Errors raised while listing or finding the English transcript are not
    caught here and propagate to the caller.  Errors raised while translating,
    fetching, formatting or writing a transcript are reported on stdout and a
    placeholder SRT file is written instead.
    """
    metadata = []
    formatter = SRTFormatter()
    path = Path(f"subtitles/{language}")
    path.mkdir(parents=True, exist_ok=True)

    if is_task_playlist:
        playlist_videos = Playlist.getVideos(TASK_VIDEOS_PLAYLIST)
    else:
        playlist_videos = Playlist.getVideos(COURSE_VIDEOS_PLAYLIST)

    # `idx` counts every playlist entry, including skipped ones, so the
    # numeric prefix of a file reflects the video's position in the playlist.
    for idx, video in enumerate(playlist_videos["videos"]):
        video_id = video["id"]
        title = video["title"]

        # Skip course events
        if "Event Day" in title:
            continue

        # Skip task videos that don't belong to the course
        if video_id in TASK_VIDEOS_TO_SKIP:
            continue

        title_formatted = title.lower().replace(" ", "-").replace(":", "").replace("?", "")
        id_str = f"{idx}".zfill(2)
        if is_task_playlist:
            srt_filename = f"{path}/tasks_{id_str}_{title_formatted}.srt"
        else:
            srt_filename = f"{path}/{id_str}_{title_formatted}.srt"

        # Get transcript
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        english_transcript = transcript_list.find_transcript(language_codes=["en", "en-US"])
        languages = pd.DataFrame(english_transcript.translation_languages)["language_code"].tolist()

        # Map mismatched language codes
        if language not in languages:
            if youtube_language_code is None:
                raise ValueError(
                    f"Language code {language} not found in YouTube's list of supported language: {languages}. Please provide a value for `youtube_language_code` and try again."
                )
            language_code = youtube_language_code
        else:
            language_code = language

        try:
            translated_transcript = english_transcript.translate(language_code)
            translated_transcript = translated_transcript.fetch()
            srt_formatted = formatter.format_transcript(translated_transcript)
            with open(srt_filename, "w", encoding="utf-8") as f:
                f.write(srt_formatted)
        except Exception:
            # Best effort: one failing video must not abort the whole run.
            # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
            # and SystemExit propagate so the user can still stop the script.
            print(f"Problem generating transcript for {title} with ID {video_id} at {video['link']}.")
            with open(srt_filename, "w", encoding="utf-8") as f:
                f.write("No transcript found for this video!")

        # Recorded for failed videos too, so the CSV lists every SRT file written.
        metadata.append({"id": video_id, "title": title, "link": video["link"], "srt_filename": srt_filename})

    df = pd.DataFrame(metadata)
    if is_task_playlist:
        df.to_csv(f"{path}/metadata_tasks.csv", index=False)
    else:
        df.to_csv(f"{path}/metadata.csv", index=False)