Jupyter notebook to scrape Stanford's list of courses. It collects the following for each course:
- course title
- course description
- course numbers/ids

In [2]:
from time import sleep

from tqdm import tqdm

# ExploreCourses search URL for the "all courses" query; the scraping loop
# appends a "&page=N" parameter to it for each request.
MAIN_INDEX_URL = "https://explorecourses.stanford.edu/search?q=all%20courses"

In [3]:
import requests
from bs4 import BeautifulSoup
import re

In [99]:
# Request headers sent with every page fetch; the User-Agent is a desktop
# Chrome-on-macOS browser string.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

Scrape all courses.
Before running, set `TOTAL_PAGES` to the page count shown in the footer of https://explorecourses.stanford.edu/search?q=all%20courses

In [103]:
# Number of result pages to fetch; pages 0 .. TOTAL_PAGES - 1 are requested.
# Update this from the pager footer of the search page before re-running.
TOTAL_PAGES = 1541

def formatt(text):
    """Clean the raw text of a scraped course description.

    Surrounding whitespace (spaces as well as CR/LF/tab) is stripped, and a
    trailing "more »" marker is removed together with any whitespace that
    separated it from the description itself.

    Args:
        text: Raw text content of a description element.

    Returns:
        The description with no leading/trailing whitespace and no trailing
        "more »" marker.
    """
    more_marker = "more »"
    text = text.strip()
    if text.endswith(more_marker):
        # Slice by the marker's own length instead of a hard-coded 6.
        text = text[:-len(more_marker)]
    # Second strip removes the space that sat between the text and the marker.
    return text.strip()

all_courses = []

# sadly the api seems to be for students and faculty only

# Seconds to wait for the server before giving up on a single page request.
REQUEST_TIMEOUT_SECONDS = 30


def _field_text(node, tag, css_class):
    """Return the text of the first `tag` with class `css_class` under `node`, or "" if absent."""
    found = node.find(tag, {"class": css_class})
    return found.text if found is not None else ""


for page in tqdm(range(TOTAL_PAGES)):
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(
        MAIN_INDEX_URL + f"&page={page}",
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly on HTTP errors instead of parsing an error page as "no courses".
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")
    for info in soup.find_all("div", {"class": "courseInfo"}):
        number = _field_text(info, "span", "courseNumber").rstrip(":")
        title = _field_text(info, "span", "courseTitle")
        if not number or not title:
            # An entry without a number or title cannot be identified later;
            # skip it rather than abort the whole run.
            continue
        all_courses.append({
            "number": number,
            "title": title,
            # A missing description element becomes an empty description.
            "description": formatt(_field_text(info, "div", "courseDescription")),
        })
    # don't spam their servers too much
    sleep(0.5)

100%|██████████| 1541/1541 [37:09<00:00,  1.45s/it]


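Deduplicate courses.
A cross-listed course appears once per id, and each listing's title carries the other ids in parentheses. Only the first listing is kept, with all of its ids merged into the `number` field.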

In [127]:
# A parenthesised group at the very end of a title, plus the whitespace before
# it. The same match is used both to read the ids and to cut them off the
# title, so the two can never disagree when a title contains other parentheses.
_TRAILING_IDS_RE = re.compile(r"\s*\(([^()]*)\)\s*$")

course_ids = set()
unique_courses = []
for course in all_courses:
    # check if we already found a duplicate of this course
    if course["number"] in course_ids:
        continue
    ids = [course["number"]]
    title = course["title"]
    res = _TRAILING_IDS_RE.search(title)
    if res:
        # Split on bare commas and trim, so "A, B" and "A,B" both work.
        ids.extend(part.strip() for part in res.group(1).split(",") if part.strip())
        # strip the course ids from the title "(..."
        title = title[:res.start()]
    course_ids.update(ids)
    # Build a new record instead of editing `course` in place: all_courses stays
    # untouched, so re-running this cell gives the same result.
    unique_courses.append({
        **course,
        "title": title,
        "number": ", ".join(ids)
    })

In [140]:
import pandas as pd
# Tabulate the de-duplicated records and save a snapshot to disk.
df = pd.DataFrame(data=unique_courses)
df.to_csv(path_or_buf="stanford_courses_unique.csv")

Clean descriptions:
- remove urls
- remove course ids
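- remove "Continuation of" (TODO: not implemented in the cell below; the phrase is only discounted later by the generic-description check)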

In [141]:
# Raw-string patterns: \S and \d must reach the regex engine as written. In a
# plain string literal they are invalid escape sequences and trigger warnings.
_URL_RE = re.compile(r'http[s]?://\S+')
# Uppercase subject code, a space, a number, and an optional uppercase suffix.
_COURSE_ID_RE = re.compile(r'[A-Z]+ \d+(?:[A-Z]+)?')


def _strip_references(desc):
    """Return `desc` with URLs removed first, then course-id-like tokens."""
    without_urls = _URL_RE.sub('', desc)
    return _COURSE_ID_RE.sub('', without_urls)


# Each output record is a new dict; the entries of unique_courses are not modified.
cleaned_courses = [
    {**course, "description": _strip_references(course["description"])}
    for course in unique_courses
]
    

In [4]:
import pandas as pd

In [143]:
# Tabulate the cleaned course records (one row per course).
df = pd.DataFrame(data=cleaned_courses)

In [144]:
# index=False: the row index is not data. Writing it adds an unnamed leading
# column that comes back as a spurious "Unnamed: 0" field when this file is
# re-read without index_col, and would then leak into later exports.
df.to_csv("stanford_courses_cleaned.csv", index=False)

In [5]:
# Reload the cleaned table with every column read as text. With the default NA
# markers switched off, only empty fields are parsed as missing values.
df = pd.read_csv(
    "stanford_courses_cleaned.csv",
    dtype=str,
    keep_default_na=False,
    na_values='',
)

Some preprocessing to remove generic descriptions

In [6]:
import string

# Placeholder descriptions are reported as NOT generic (the function returns
# False for them).
_PLACEHOLDER_DESCRIPTIONS = ("tba", "tbd", "description tbd")

# Boilerplate phrases that do not count as real content. They are removed in
# this order, each from the already-reduced text, as plain substrings.
_BOILERPLATE_PHRASES = (
    "prerequisite",
    "continuation of",
    "graduation",
    "prior arrangement",
    "consent of instructor",
    "doctoral practicum",
    "may be repeated",
    "required suprvised",   # original (misspelled) entry, kept in case the data has the same typo
    "required supervised",  # correct spelling, which the misspelled entry never matched
    "program consent required",
    "supervised experience",
    "students must obtain",
    "graduate",
    "research",
    "tutorial in",
    "independent study",
    "for credit",
    "for advanced",
)

# Built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def detect_generic(text, min_length=20):
    """Tell whether a course description is only generic boilerplate.

    The text is lower-cased and stripped of ASCII punctuation. Known
    boilerplate phrases are then deleted, and the description counts as
    generic when fewer than `min_length` characters remain.

    Args:
        text: The course description.
        min_length: Minimum number of characters that must remain after
            boilerplate removal for the description to count as informative.
            Defaults to 20.

    Returns:
        True if the description is generic, False otherwise. Pure
        placeholders ("TBA", "TBD", "Description TBD") return False.
    """
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Compare ignoring surrounding whitespace so " TBA " is still a placeholder.
    if text.strip() in _PLACEHOLDER_DESCRIPTIONS:
        return False
    for phrase in _BOILERPLATE_PHRASES:
        text = text.replace(phrase, "")
    return len(text) < min_length

In [7]:
non_generic_courses = []

# Titles shorter than this cannot stand in for a missing description.
MIN_STANDALONE_TITLE_LENGTH = 25

for _, row in df.iterrows():
    description = row["description"]
    # no description
    if not isinstance(description, str):
        title = row["title"]
        # A missing title is also a non-string here; treat it as too short
        # instead of letting len() raise on it.
        if not isinstance(title, str) or len(title) < MIN_STANDALONE_TITLE_LENGTH:
            continue  # no description + short title = unusable
        description = "TBD"
        row["description"] = description
    if detect_generic(description):
        continue
    non_generic_courses.append(row)

In [10]:
# Write the retained rows to disk, leaving out the row index.
pd.DataFrame(data=non_generic_courses).to_csv(
    "stanford_courses_cleaned_non_generic.csv",
    index=False,
)