prompts/khanacademy/khan_dl/khan_dl.py (321 lines of code) (raw):
# Adapted from https://github.com/rand-net/khan-dl
import logging
import os
import platform
import sys
from typing import List, Tuple
import requests
from bs4 import BeautifulSoup
from prompt_toolkit import prompt
from prompt_toolkit.completion import FuzzyWordCompleter
# Base URLs. A YouTube video id is appended to VIDEO_SITE_URL; a site-relative
# path (domain name or scraped href) is appended to ROOT_URL.
VIDEO_SITE_URL = "https://www.youtube.com/watch?v="
ROOT_URL = "https://www.khanacademy.org"

# Top-level subject areas; each one is fetched as ROOT_URL + "/" + <domain>.
# Must stay a list: the prompt code looks entries up with DOMAINS.index().
DOMAINS = [
    "math",
    "science",
    "computing",
    "humanities",
    "economics-finance-domain",
    "ela",
]

# Tags and attributes for parsing HTML.
# NOTE(review): the "_xxxx" class names look machine-generated and are
# presumably liable to change whenever the site is redeployed - verify them
# against the live markup if scraping starts returning empty results.

# Domain page: heading that wraps each course, and the link inside it.
COURSE_HEAD = {
    "tag": "h2",
    "class": "_t2uf76",
}
COURSE_URL = {
    "tag": "a",
    "class": "_dwmetq",
}

# Course page: attribute filters for the course title and the unit headers.
COURSE_TITLE = {"data-test-id": "course-unit-title"}
COURSE_UNIT_TITLE = {"data-test-id": "unit-header"}

# Unit page: subunit header links and the list element holding their lessons.
COURSE_SUBUNIT_TITLE_ATTRS = {"data-test-id": "lesson-card-link"}
COURSE_SUBUNIT_BODY = {
    "tag": "ul",
    "class": "_37mhyh",
}

# Lesson block: a <div> carrying either of the two listed classes, the span
# whose aria-label names the lesson kind, and the span holding the title.
COURSE_LESSON_BODY = {
    "tag": "div",
    "class_i": "_10ct3cvu",
    "class_ii": "_1p9458yw",
}
COURSE_LESSON_SPAN = {
    "tag": "span",
    "class": "_e296pg",
}
COURSE_LESSON_LABEL = "aria-label"
COURSE_LESSON_TITLE = {
    "tag": "span",
    "class": "_14hvi6g8",
}

"""
Domain -> Course -> Unit Page -> Subunit Header + Subunit Block -> Lesson Block -> Lesson Title
"""
def clear_screen():
    """Clear the terminal with the host platform's native shell command.

    Runs ``clear`` when ``platform.system()`` reports ``Linux`` or ``Darwin``
    and ``cls`` when it reports ``Windows``. On any other platform no command
    is executed.
    """
    command_by_system = {
        "Linux": "clear",
        "Darwin": "clear",
        "Windows": "cls",
    }
    command = command_by_system.get(platform.system())
    if command is not None:
        os.system(command)
# Youtube-dl NoLogger
class MyLogger:
    """Logger stand-in whose methods accept a message and discard it.

    It exposes the ``debug`` / ``warning`` / ``error`` trio; each method
    ignores its argument and returns ``None``.
    """

    def debug(self, msg):
        """Ignore a debug-level message."""

    def warning(self, msg):
        """Ignore a warning-level message."""

    def error(self, msg):
        """Ignore an error-level message."""
class KhanDL:
    """Scrape the structure of Khan Academy courses.

    The pages are walked in the order
    Domain -> Course -> Unit Page -> Subunit Header + Subunit Block ->
    Lesson Block -> Lesson Title, and the results are accumulated on the
    instance (see the attribute comments in ``__init__``).

    Every network failure reported by ``requests`` is printed and ends the
    process with exit status 1.
    """

    # Forwarded to ``requests.get(timeout=...)``. ``None`` is the requests
    # default (wait indefinitely) and keeps the historical behaviour; set a
    # number of seconds on the class or on an instance to bound slow requests.
    request_timeout = None

    def __init__(self):
        # Entry of DOMAINS chosen in domain_prompt().
        self.domain = ""
        # Absolute URL of the course currently being processed.
        self.course_url = ""
        # Course title with spaces replaced by underscores.
        self.course_title = ""
        # Parsed course page (BeautifulSoup); "" until get_course_page() runs.
        self.course_page = ""
        # Per-unit data scraped from the course page.
        self.course_unit_titles = []
        self.course_unit_slugs = []
        self.course_unit_urls = []
        # One output path per video lesson, built by get_course_all_slugs().
        self.course_all_slugs = []
        # Raw (unstripped) title of every video lesson found.
        self.lesson_titles = []
        # Not populated by the active code (video-id collection is disabled).
        self.lesson_youtube_ids = []
        # Current working directory plus a trailing "/"; prefix of every slug.
        self.output_rel_path = os.getcwd() + "/"
        # Not populated by the active code (video-id collection is disabled).
        self.unit_ids_counter = {}
        # Unit href -> number of video lessons found in that unit.
        self.unit_slugs_counter = {}
        # Unit-header hrefs with at most two "/" characters; they are treated
        # as courses of their own by download_nested_courses().
        self.nested_courses = []
        # [{"title": unit, "subunits": [{"title": ..., "lessons": [...]}]}]
        self.course_subunits = []
        # Course name chosen in course_prompt().
        self.selected_course = ""

    def _fetch_page(self, url, generic_error_label="OOps: Something Else"):
        """Download ``url`` and return it parsed by BeautifulSoup ("lxml").

        Args:
            url: Absolute URL to fetch.
            generic_error_label: Text printed in front of any ``requests``
                error that is not a connection, HTTP or timeout error.

        Returns:
            The parsed page.

        Raises:
            SystemExit: with status 1 after printing the error, for any
                ``requests.exceptions.RequestException``.
        """
        try:
            response = requests.get(url, timeout=self.request_timeout)
        # Handler order matters: all of these derive from RequestException,
        # so the most specific classes have to be tried first.
        except requests.ConnectionError as err:
            print("Error Connecting!\n", err)
        except requests.exceptions.HTTPError as err:
            print("Http Error:", err)
        except requests.exceptions.Timeout as err:
            print("Timeout Error:", err)
        except requests.exceptions.RequestException as err:
            print(generic_error_label, err)
        else:
            return BeautifulSoup(response.text, "lxml")
        sys.exit(1)

    def get_courses(self, selected_domain_url: str) -> Tuple[List[str], List[str]]:
        """Return the courses listed on a domain page.

        Args:
            selected_domain_url: Absolute URL of the domain page.

        Returns:
            ``(courses, courses_url)``: two parallel lists holding the link
            text and the absolute URL of every course found. Course headings
            that contain no link, or whose link has no ``href``, are skipped.
        """
        courses, courses_url = [], []
        print("\nDownloading Courses...\n")
        selected_domain_page = self._fetch_page(selected_domain_url)

        for course_header in selected_domain_page.find_all(
            COURSE_HEAD["tag"], class_=COURSE_HEAD["class"]
        ):
            course_link = course_header.find(
                COURSE_URL["tag"], class_=COURSE_URL["class"]
            )
            course_slug = None if course_link is None else course_link.get("href")
            if course_slug is None:
                # Appending to only one list would break the index pairing
                # that course_prompt() relies on, so skip the header entirely.
                logging.debug("course header without link skipped")
                continue
            courses.append(course_link.text)
            courses_url.append(ROOT_URL + course_slug)
        return courses, courses_url

    def domain_prompt(self):
        """Ask the user for a domain and store it in ``self.domain``.

        Raises:
            ValueError: if the (lower-cased) answer is not an entry of DOMAINS.
        """
        # Titlecase for aesthetics; the answer is lower-cased again below.
        domain_completer = FuzzyWordCompleter([name.title() for name in DOMAINS])
        answer = prompt("Domain: ", completer=domain_completer)
        selected_domain = DOMAINS.index(answer.lower())
        print("Selected Domain: {}".format(DOMAINS[selected_domain]))
        self.domain = DOMAINS[selected_domain]
        logging.info("Domain Selected")

    def course_prompt(self):
        """Ask the user for a course of ``self.domain`` and store the choice.

        Sets ``self.selected_course`` and ``self.course_url``.

        Raises:
            ValueError: if the answer is not one of the listed course names.
        """
        selected_domain_url = ROOT_URL + "/" + self.domain
        courses, courses_url = self.get_courses(selected_domain_url)
        logging.debug(courses)

        courses_completer = FuzzyWordCompleter(courses)
        answer = prompt("Course: ", completer=courses_completer)
        selected_course_index = courses.index(answer)

        self.selected_course = courses[selected_course_index]
        print("Selected Course: {}".format(self.selected_course))
        self.course_url = courses_url[selected_course_index]
        logging.info("Course Selected")

    def get_all_courses(self) -> List[str]:
        """Return the URLs of all courses of every domain in DOMAINS."""
        print("Downloading all Courses from all Domains...")
        all_courses_url = []
        for domain in DOMAINS:
            print("Selected Domain: ", domain)
            _, courses_url = self.get_courses(ROOT_URL + "/" + domain)
            all_courses_url.extend(courses_url)
        return all_courses_url

    def get_course_page(self):
        """Fetch ``self.course_url`` and store the parsed page in ``self.course_page``."""
        print("Course URL: {}".format(self.course_url))
        self.course_page = self._fetch_page(self.course_url, "Oops: Something Else")

    def get_course_title(self):
        """Read the course title from the course page.

        Stores the title, with spaces replaced by underscores, in
        ``self.course_title``; the attribute is left unchanged when the page
        has no (non-empty) title element.
        """
        course_title = self.course_page.find(attrs=COURSE_TITLE)
        if course_title and course_title.text:
            self.course_title = course_title.text.replace(" ", "_")
        logging.debug("course_title:{}".format(self.course_title))
        logging.info("Course title retrieved")

    def get_course_unit_titles(self):
        """Collect the unit titles of the course page.

        Only unit headers whose text contains "unit" (case-insensitive) are
        kept; the result replaces ``self.course_unit_titles``.
        """
        self.course_unit_titles = [
            header.text
            for header in self.course_page.find_all(attrs=COURSE_UNIT_TITLE)
            if "unit" in header.text.lower()
        ]
        logging.debug("course_unit_titles:{}".format(self.course_unit_titles))
        logging.info("Course unit titles retrieved")

    def get_course_unit_slugs(self):
        """Build one path slug per unit title.

        Each slug has the form ``<course_title>/<index>_<unit title>`` with
        the title's spaces replaced by underscores and a zero-based index.
        The result replaces ``self.course_unit_slugs``.
        """
        self.course_unit_slugs = [
            self.course_title + "/" + str(index) + "_" + title.replace(" ", "_")
            for index, title in enumerate(self.course_unit_titles)
        ]
        logging.debug("course_unit_slugs:{}".format(self.course_unit_slugs))
        logging.info("Course unit slugs generated")

    def get_course_unit_urls(self):
        """Split the unit-header links into unit URLs and nested courses.

        An href containing more than two "/" characters goes to
        ``self.course_unit_urls``; any other href goes to
        ``self.nested_courses``. Headers without an ``href`` are skipped.
        Both lists are rebuilt from scratch.
        """
        self.course_unit_urls = []
        self.nested_courses = []
        for unit_header in self.course_page.find_all(attrs=COURSE_UNIT_TITLE):
            href = unit_header.get("href")
            if href is None:
                logging.debug("unit header without href skipped")
                continue
            if href.count("/") > 2:
                self.course_unit_urls.append(href)
            else:
                self.nested_courses.append(href)
        logging.debug("course_unit_urls:{}".format(self.course_unit_urls))
        logging.debug("nested_courses:{}".format(self.nested_courses))
        logging.info("Course unit urls retrieved")

    @staticmethod
    def _video_lesson_titles(course_subunit_body):
        """Yield the raw title text of each video lesson in a subunit block.

        A lesson block counts as a video when the ``aria-label`` of its first
        label span equals "Video". Blocks lacking the span or the title
        element are skipped.
        """
        for course_lesson_body in course_subunit_body.find_all(
            COURSE_LESSON_BODY["tag"],
            {
                "class": [
                    COURSE_LESSON_BODY["class_i"],
                    COURSE_LESSON_BODY["class_ii"],
                ]
            },
        ):
            course_lesson_span = course_lesson_body.find(
                COURSE_LESSON_SPAN["tag"], class_=COURSE_LESSON_SPAN["class"]
            )
            if course_lesson_span is None:
                continue
            course_lesson_aria_label = course_lesson_span.get(COURSE_LESSON_LABEL)
            logging.debug(
                "course_lesson_aria_label:{}".format(course_lesson_aria_label)
            )
            if course_lesson_aria_label != "Video":
                continue
            lesson_title = course_lesson_body.find(
                COURSE_LESSON_TITLE["tag"], class_=COURSE_LESSON_TITLE["class"]
            )
            if lesson_title is None:
                continue
            yield lesson_title.text

    def get_course_all_slugs(self):
        """Fetch every unit page and record its subunits and video lessons.

        For each ``(url, slug, title)`` triple zipped from
        ``course_unit_urls``, ``course_unit_slugs`` and ``course_unit_titles``
        the unit page is downloaded and the following are appended/updated:

        * ``lesson_titles``: raw title of every video lesson;
        * ``course_all_slugs``: ``<output_rel_path><unit slug>/
          <subunit index>_<subunit title>/<lesson index>_<lesson title>``
          (spaces replaced by underscores, zero-based indices, lesson index
          counting video lessons only);
        * ``course_subunits``: one ``{"title", "subunits"}`` dict per unit;
        * ``unit_slugs_counter``: unit url -> number of video lessons.

        NOTE(review): unit titles and unit urls are filtered by different
        criteria in their getters, so the zip presumably assumes both filters
        keep the same headers - confirm against real pages.
        """
        # Unit Page -> Subunit Header + Subunit Block -> Lesson Block -> Lesson Title
        for course_unit_url, course_unit_slug, course_unit_title in zip(
            self.course_unit_urls, self.course_unit_slugs, self.course_unit_titles
        ):
            # -> Unit Page
            course_unit_page = self._fetch_page(ROOT_URL + course_unit_url)

            unit_lessons_counter = 0
            subunits = []
            # -> Subunit Header -> Subunit Block (paired by document order)
            subunit_pairs = zip(
                course_unit_page.find_all(attrs=COURSE_SUBUNIT_TITLE_ATTRS),
                course_unit_page.find_all(
                    COURSE_SUBUNIT_BODY["tag"], class_=COURSE_SUBUNIT_BODY["class"]
                ),
            )
            for subunit_counter, (course_subunit_title, course_subunit_body) in (
                enumerate(subunit_pairs)
            ):
                logging.debug("course_subunit_title:{}".format(course_subunit_title))
                subunit_path = (
                    self.output_rel_path
                    + course_unit_slug
                    + "/"
                    + str(subunit_counter)
                    + "_"
                    + course_subunit_title.text.replace(" ", "_")
                )

                # -> Lesson Block -> Lesson Title (video lessons only)
                lessons = []
                for lesson_counter, lesson_title in enumerate(
                    self._video_lesson_titles(course_subunit_body)
                ):
                    logging.debug("course_lesson_title:{}".format(lesson_title))
                    lessons.append(lesson_title.strip())
                    self.lesson_titles.append(lesson_title)
                    self.course_all_slugs.append(
                        subunit_path
                        + "/"
                        + str(lesson_counter)
                        + "_"
                        + lesson_title.replace(" ", "_")
                    )

                unit_lessons_counter += len(lessons)
                subunits.append(
                    {
                        "title": course_subunit_title.text.strip(),
                        "lessons": lessons,
                    }
                )

            self.course_subunits.append(
                {
                    "title": course_unit_title,
                    "subunits": subunits,
                }
            )
            self.unit_slugs_counter[course_unit_url] = unit_lessons_counter

        logging.info(len(self.course_all_slugs))
        logging.info("Course - All slugs generated")

    def get_course_youtube_ids(self):
        """Retrieves youtube id per unit (currently disabled: does nothing).

        The yt_dlp-based implementation from the upstream khan-dl project is
        not active in this adaptation. The method is kept as a no-op because
        download_course_interactive() still calls it;
        ``lesson_youtube_ids`` and ``unit_ids_counter`` therefore stay empty.
        """

    def download_course_videos(self):
        """Downloads Course Videos (currently disabled: does nothing).

        The yt_dlp-based implementation from the upstream khan-dl project is
        not active in this adaptation. The method is kept as a no-op because
        download_course_interactive() still calls it.
        """

    def reset_course(self):
        """Reset the per-course state to its initial empty values.

        ``nested_courses`` and ``output_rel_path`` are deliberately left
        alone: download_nested_courses() calls this method and then still
        needs the pending nested-course list.
        """
        self.domain = ""
        self.course_url = ""
        self.course_title = ""
        self.course_page = ""
        self.selected_course = ""

        self.course_unit_titles = []
        self.course_unit_slugs = []
        self.course_unit_urls = []
        self.course_all_slugs = []
        self.course_subunits = []
        self.lesson_titles = []
        self.lesson_youtube_ids = []

        self.unit_ids_counter = {}
        self.unit_slugs_counter = {}

    def download_nested_courses(self):
        """Process every href recorded in ``self.nested_courses`` as a course.

        The per-course state is reset first. The dictionaries returned by
        download_course_given() are discarded.
        """
        self.reset_course()
        if not self.nested_courses:
            return
        print("\nDownloading nested courses...\n")
        # Iterate a snapshot: each download_course_given() call rebinds
        # self.nested_courses (via get_course_unit_urls()).
        for nested_course_url in list(self.nested_courses):
            self.download_course_given(ROOT_URL + nested_course_url)

    def download_course_interactive(self):
        """Prompt for a domain and a course, then scrape that course.

        Afterwards the nested courses found on the course page are processed
        too, which resets the per-course state collected here.
        """
        self.domain_prompt()
        self.course_prompt()
        self.get_course_page()
        self.get_course_title()
        self.get_course_unit_titles()
        self.get_course_unit_slugs()
        self.get_course_unit_urls()
        print("\nGenerating Path Slugs...\n")
        self.get_course_all_slugs()
        self.get_course_youtube_ids()
        self.download_course_videos()
        self.download_nested_courses()

    def download_course_given(self, course_url: str):
        """Scrape the course at ``course_url`` and return its structure.

        Despite the name, no video is downloaded. The per-course state is
        reset before scraping.

        Args:
            course_url: Absolute URL of the course page.

        Returns:
            A dict with the keys ``"domain"``, ``"url"``, ``"title"``,
            ``"unit_titles"`` and ``"subunits"`` taken from the instance
            attributes of the same meaning.
        """
        self.reset_course()
        self.course_url = course_url
        self.get_course_page()
        self.get_course_title()
        self.get_course_unit_titles()
        self.get_course_unit_slugs()
        self.get_course_unit_urls()
        self.get_course_all_slugs()
        return {
            "domain": self.domain,
            "url": self.course_url,
            "title": self.course_title,
            "unit_titles": self.course_unit_titles,
            "subunits": self.course_subunits,
        }