prompts/stanford/1_scraper.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "aeed1babdd1ced9b",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Stanford course scraper\n",
    "\n",
    "Jupyter notebook to scrape Stanford's list of courses from https://explorecourses.stanford.edu. Gets the following:\n",
    "- course title\n",
    "- course description\n",
    "- course numbers/ids\n",
    "\n",
    "Afterwards the courses are deduplicated, their descriptions are cleaned, and courses with generic descriptions are filtered out.\n",
    "\n",
    "The last full scrape (September 2023, 1541 pages) took about 37 minutes. To skip it, run the import and configuration cells and continue from the cell that reloads `stanford_courses_cleaned.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import string\n",
    "from time import sleep\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd55f81c3d1d42dd",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "MAIN_INDEX_URL = \"https://explorecourses.stanford.edu/search?q=all%20courses\"\n",
    "\n",
    "# Number of result pages to fetch; update it from the footer of MAIN_INDEX_URL before scraping.\n",
    "TOTAL_PAGES = 1541\n",
    "\n",
    "# Pause between two page requests.\n",
    "REQUEST_DELAY_SECONDS = 0.5\n",
    "# Give up on a request that gets no response instead of blocking the whole scrape.\n",
    "REQUEST_TIMEOUT_SECONDS = 30\n",
    "\n",
    "headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bbce7731b5ff3ad2",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Scrape all courses\n",
    "\n",
    "Change `TOTAL_PAGES` in the configuration cell based on the footer on https://explorecourses.stanford.edu/search?q=all%20courses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "scrape-helpers",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def formatt(text):\n",
    "    \"\"\"Strip surrounding line breaks/tabs and a trailing \"more »\" label from a description.\"\"\"\n",
    "    suffix = \"more »\"\n",
    "    text = text.strip(\"\\r\\n\\t\")\n",
    "    if text.endswith(suffix):\n",
    "        text = text[:-len(suffix)]\n",
    "    return text.strip(\"\\r\\n\\t\")\n",
    "\n",
    "\n",
    "def element_text(parent, tag, css_class):\n",
    "    \"\"\"Return the text of the first `tag` with class `css_class` inside `parent`, or an empty string if there is none.\"\"\"\n",
    "    node = parent.find(tag, {\"class\": css_class})\n",
    "    return node.text if node is not None else \"\"\n",
    "\n",
    "\n",
    "def parse_courses(html):\n",
    "    \"\"\"Return one dict with the keys number, title and description per course found in a results page.\"\"\"\n",
    "    # Name the parser explicitly so the result does not depend on which parsers are installed.\n",
    "    soup = BeautifulSoup(html, \"html.parser\")\n",
    "    courses = []\n",
    "    for info in soup.find_all(\"div\", {\"class\": \"courseInfo\"}):\n",
    "        number = element_text(info, \"span\", \"courseNumber\").rstrip(\":\")\n",
    "        if not number:\n",
    "            # The deduplication step keys on the course number, so an entry without one is skipped.\n",
    "            continue\n",
    "        courses.append({\n",
    "            \"number\": number,\n",
    "            \"title\": element_text(info, \"span\", \"courseTitle\"),\n",
    "            \"description\": formatt(element_text(info, \"div\", \"courseDescription\")),\n",
    "        })\n",
    "    return courses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4523f23672936f0a",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_courses = []\n",
    "\n",
    "# sadly the api seems to be for students and faculty only\n",
    "\n",
    "for p in tqdm(range(TOTAL_PAGES), desc=\"scraping pages\"):\n",
    "    r = requests.get(MAIN_INDEX_URL + f\"&page={p}\", headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)\n",
    "    # Stop on an HTTP error instead of parsing the error page as a page without courses.\n",
    "    r.raise_for_status()\n",
    "    all_courses.extend(parse_courses(r.content))\n",
    "    # don't spam their servers too much\n",
    "    sleep(REQUEST_DELAY_SECONDS)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b140124d216e49b3",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Deduplicate courses\n",
    "\n",
    "Courses listed multiple times with different ids have the other ids inside brackets at the end of the title."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dedup-helpers",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A bracketed group at the very end of a title, e.g. \"Some Title (ID 1, ID 2)\".\n",
    "CROSS_LISTING_RE = re.compile(r\"\\s*\\(([^()]*)\\)\\s*$\")\n",
    "\n",
    "\n",
    "def split_cross_listings(title):\n",
    "    \"\"\"Split a title into (title without its trailing bracket group, list of the ids inside that group).\n",
    "\n",
    "    The ids and the cut position come from the same match, so a bracket elsewhere in the title\n",
    "    is never mistaken for the id list. A title without a trailing group is returned unchanged.\n",
    "    \"\"\"\n",
    "    match = CROSS_LISTING_RE.search(title)\n",
    "    if match is None:\n",
    "        return title, []\n",
    "    other_ids = [part.strip() for part in match.group(1).split(\",\") if part.strip()]\n",
    "    return title[:match.start()], other_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc18a648c4fa5d50",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "course_ids = set()\n",
    "unique_courses = []\n",
    "for course in all_courses:\n",
    "    # check if we already found a duplicate of this course\n",
    "    if course[\"number\"] in course_ids:\n",
    "        continue\n",
    "    title, other_ids = split_cross_listings(course[\"title\"])\n",
    "    ids = [course[\"number\"], *other_ids]\n",
    "    course_ids.update(ids)\n",
    "    # Build a new dict rather than editing `course`: the entries of all_courses stay as scraped,\n",
    "    # so running this cell again gives the same result.\n",
    "    unique_courses.append({\n",
    "        **course,\n",
    "        \"title\": title,\n",
    "        \"number\": \", \".join(ids),\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8e4338438a8e5ae",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "unique_df = pd.DataFrame(unique_courses)\n",
    "unique_df.to_csv(\"stanford_courses_unique.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "78788009ded5f401",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Clean descriptions\n",
    "\n",
    "- remove urls\n",
    "- remove course ids\n",
    "\n",
    "Boilerplate such as \"Continuation of\" is not removed here; it is discounted further down when generic descriptions are detected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a59836f513898f95",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "URL_RE = re.compile(r\"http[s]?://\\S+\")\n",
    "# Course ids such as \"CS 106A\" or \"MS&E 220\": subject code, space, number, optional letter suffix.\n",
    "COURSE_ID_RE = re.compile(r\"[A-Z][A-Z&]* \\d+[A-Z]*\")\n",
    "\n",
    "cleaned_courses = [\n",
    "    {\n",
    "        **course,\n",
    "        # urls first, then course ids\n",
    "        \"description\": COURSE_ID_RE.sub(\"\", URL_RE.sub(\"\", course[\"description\"])),\n",
    "    }\n",
    "    for course in unique_courses\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed912eb58e38574c",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cleaned_df = pd.DataFrame(cleaned_courses)\n",
    "# NOTE(review): the index is written too, so it comes back as an \"Unnamed: 0\" column when the file\n",
    "# is reloaded below and ends up in the final CSV. Left as is so the files keep their current\n",
    "# columns - confirm that nothing reads that column before switching to index=False.\n",
    "cleaned_df.to_csv(\"stanford_courses_cleaned.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "303ce35ad306db4b",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Checkpoint: reload the cleaned courses so the steps below can be re-run without scraping again.\n",
    "# Every column is read as text and only empty fields are treated as missing.\n",
    "courses_df = pd.read_csv(\"stanford_courses_cleaned.csv\", dtype=str, na_values='', keep_default_na=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ae1b01a983d663f",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Remove generic descriptions\n",
    "\n",
    "Some preprocessing to remove courses whose description is only generic boilerplate."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ced2e95857c5d12b",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)\n",
    "\n",
    "# Placeholder descriptions that detect_generic deliberately reports as not generic.\n",
    "PLACEHOLDER_DESCRIPTIONS = (\"tba\", \"tbd\", \"description tbd\")\n",
    "\n",
    "# Boilerplate phrases that are removed before measuring how much text is left.\n",
    "GENERIC_PHRASES = (\n",
    "    \"prerequisite\", \"continuation of\", \"graduation\", \"prior arrangement\", \"consent of instructor\",\n",
    "    \"doctoral practicum\", \"may be repeated\", \"required supervised\", \"program consent required\",\n",
    "    \"supervised experience\", \"students must obtain\", \"graduate\", \"research\", \"tutorial in\",\n",
    "    \"independent study\", \"for credit\", \"for advanced\",\n",
    ")\n",
    "\n",
    "# A description with fewer characters than this left after removing the boilerplate is generic.\n",
    "MIN_INFORMATIVE_CHARS = 20\n",
    "\n",
    "\n",
    "def detect_generic(text):\n",
    "    \"\"\"Return True if the description `text` is too generic to be useful.\n",
    "\n",
    "    The text is lower-cased and stripped of punctuation, every phrase in GENERIC_PHRASES is removed,\n",
    "    and the description is generic when fewer than MIN_INFORMATIVE_CHARS characters remain.\n",
    "    Placeholders (PLACEHOLDER_DESCRIPTIONS) return False: the filtering loop assigns \"TBD\" to\n",
    "    courses it keeps because of their title, and those must not be dropped here.\n",
    "    \"\"\"\n",
    "    text = text.lower().translate(PUNCTUATION_TABLE)\n",
    "    if text in PLACEHOLDER_DESCRIPTIONS:\n",
    "        return False\n",
    "    for phrase in GENERIC_PHRASES:\n",
    "        text = text.replace(phrase, \"\")\n",
    "    return len(text) < MIN_INFORMATIVE_CHARS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8dbb9403c43e07a9",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A course without a description is only kept if its title has at least this many characters.\n",
    "MIN_TITLE_CHARS = 25\n",
    "\n",
    "non_generic_courses = []\n",
    "\n",
    "# Iterate over plain dicts so that filling in a description never touches courses_df.\n",
    "for course in courses_df.to_dict(\"records\"):\n",
    "    description = course[\"description\"]\n",
    "    # no description (an empty CSV field is read back as NaN, not as a string)\n",
    "    if not isinstance(description, str):\n",
    "        title = course[\"title\"] if isinstance(course[\"title\"], str) else \"\"\n",
    "        if len(title) < MIN_TITLE_CHARS:  # no description + short title = unusable\n",
    "            continue\n",
    "        description = course[\"description\"] = \"TBD\"\n",
    "    if detect_generic(description):\n",
    "        continue\n",
    "    non_generic_courses.append(course)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a321cbac67d727f",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "non_generic_df = pd.DataFrame(non_generic_courses)\n",
    "non_generic_df.to_csv(\"stanford_courses_cleaned_non_generic.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

prompts/stanford/1_scraper.ipynb (399 lines of code) (raw):