Finals Parser (#890)

* Start of finals parser * Continued creating finals parser * Fixed some issues, began commenting * Fixed a few more issues, reconsidered logging * Improved formatting and fixed bug Fixed AM/PM bug, other changes * Converted to python script * Updated with instructions on how to convert to .py * Tested updated schedule * Added logic for cross listed courses * Added cross listing logic * Added requirements.txt * Fixed bug with \ entries * Updated the .py to match * remove idea folder * dont redefine format --------- Co-authored-by: dorian451 <112524240+dorian451@users.noreply.github.com>
YACS-RCOS · Nov 1, 2024 · ffaac6f · ffaac6f
1 parent 9ea3e4c
commit ffaac6f
Show file tree

Hide file tree

Showing 14 changed files with 2,069 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,6 @@ node_modules/
 .vscode
 .DS_Store
 .python-version
-*.ipynb*
 courses20.xml
 .coverage
 compose-dev.yaml

diff --git a/rpi_data/Finals_Parser/.ipynb_checkpoints/FinalsParser-checkpoint.ipynb b/rpi_data/Finals_Parser/.ipynb_checkpoints/FinalsParser-checkpoint.ipynb
@@ -0,0 +1,293 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6b64fad0-14d1-4489-937b-857e5ecd8ec2",
+   "metadata": {},
+   "source": [
+    "ASSUMPTIONS/DOCUMENTATION:\n",
+    "\n",
+    "    - A null section value indicates that all sections share that exam date and time.\n",
+    "    - Some ARCH courses list section as \"80\". No clue what this means.\n",
+    "    - The process uses the FinalsBySubject.pdf document from the RPI website.\n",
+    "        - This pdf should have columns Department, Course, Location, Date, and Grades Due (although the first and last don't matter)\n",
+    "        - Each page should also begin with a title line of the form Season Year, followed by any amount of additional text (which doesn't matter)\n",
+    "        - If the above are not true, small modifications must be made to the process\n",
+    "    - To handle inconsistent AM/PM labeling we assume that all exams begin at or after 8 AM and we assume all exams end at or before 10 PM\n",
+    "    - The current process assumes the finals document is named finals_schedule.pdf and is in the same folder as this process\n",
+    "    - The output is a csv file with format: ['Season', 'Year', 'Major', 'Course', 'Section', 'Start', 'End', 'Building', 'Room_Number']\n",
+    "    - Use \"jupyter nbconvert --to script FinalsParser.ipynb\" to convert the .ipynb file to a .py\n",
+    "\n",
+    "TODO:\n",
+    "\n",
+    "    - Make the grades due column not break the program - this can't be fixed without a more fully filled-out version of the exam schedule\n",
+    "        - It is possible that this column is never filled in for the publicly available version, meaning this isn't an issue."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "c5584e3c-8c4d-4656-8699-383826a60509",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pypdf import PdfReader\n",
+    "import os\n",
+    "from datetime import datetime\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import calendar\n",
+    "debug_mode = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "b89afdbc-6199-4ad8-bc03-467ea9a495ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Construct a list to get the number of a month from its name\n",
+    "months = list(calendar.month_name)\n",
+    "months = [x.lower() for x in months]\n",
+    "\n",
+    "# Turns a time, day of month, month, and year into one datetime object for the table\n",
+    "# Does this for both the start and end time for an exam\n",
+    "# This is used to get the start and end times for an exam\n",
+    "def handle_times(start_text, end_text, day, month, year):\n",
+    "    # Regex to get the hour and minute as separate values from a string of the format HH:MM AM or HH:MM PM\n",
+    "    start_nums = re.findall(r'\\d+', start_text)\n",
+    "    start_nums = [int(x) for x in start_nums]\n",
+    "    end_nums = re.findall(r'\\d+', end_text)\n",
+    "    end_nums = [int(x) for x in end_nums]\n",
+    "    # Instead of trying to track AM/PM we instead use the logic that exams only happen between 8AM - 9:30 PM and convert to military time\n",
+    "    # This is done because RPI likes to have typos such as 8:00 M instead of 8:00 PM making the AM/PM values unreliable\n",
+    "    if end_nums[0] <= 10:\n",
+    "        end_nums[0] += 12\n",
+    "    if start_nums[0] < 8:\n",
+    "        start_nums[0] += 12\n",
+    "    month_num = months.index(month.lower())\n",
+    "    # Construct and return the datetime object\n",
+    "    start_text = year + str(month_num) + day + str(start_nums[0]) + \":\" + str(start_nums[1])\n",
+    "    end_text = year + str(month_num) + day + str(end_nums[0]) + \":\" + str(end_nums[1])\n",
+    "    format = '%Y%m%d%H:%M'\n",
+    "    start_time = datetime.strptime(start_text, format)\n",
+    "    end_time = datetime.strptime(end_text, format)\n",
+    "    return start_time, end_time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "201b53c9-4435-4371-8463-cc06ec5a19dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parser():\n",
+    "    files = [f for f in os.listdir('.') if os.path.isfile(f)]\n",
+    "    \n",
+    "    reader = PdfReader(\"finals_schedule.pdf\")\n",
+    "    number_of_pages = len(reader.pages)\n",
+    "    \n",
+    "    db_lines = []\n",
+    "\n",
+    "    # Process the document page by page\n",
+    "    for page in reader.pages:\n",
+    "        text = page.extract_text(extraction_mode=\"layout\")\n",
+    "        # Process the text to handle the following cases:\n",
+    "        #   - If the location is \"TBA\" we replace this with \"TBA TBA\" since location is Room RoomNumber\n",
+    "        #   - If the location is \"ONLINE\" we replace this with \"ONLINE NA\" for the same reason (Room=ONLINE,RoomNumber=NA) \n",
+    "        #   - If there are \"/\" in the text we remove surrounding spaces so they don't cause issues (multiple different issues)\n",
+    "        #   - Remove the word \"SECTIONS\" as it's superfluous and inconsistent\n",
+    "        text = text.replace(\" / \", \"/\").replace(\"TBA\", \"TBA TBA\").replace(\"ONLINE\", \"ONLINE NA\").replace(\"(ALL \", \"(ALL\").replace(\"SECTIONS \", \"\")\n",
+    "        # Split text into lines, get the season (Fall,Summer,Spring) and year (20__) from the first line, then remove the first three lines since they are header/blank\n",
+    "        text = text.split('\\n')\n",
+    "        for_year = text[0].split(\" \")\n",
+    "        for_year = [x for x in for_year if x != '']\n",
+    "        season = for_year[0]\n",
+    "        year = for_year[1]\n",
+    "        text.pop(0)\n",
+    "        text.pop(0)\n",
+    "        text.pop(0)\n",
+    "        # Remove a fourth line for the first page only since it has the column headers\n",
+    "        if \"DEPARTMENT\" in text[0] and \"COURSE\" in text[0]:\n",
+    "            text.pop(0)\n",
+    "    \n",
+    "        # Now, parse the lines\n",
+    "        for line in text:\n",
+    "    \n",
+    "            # Remove (in SQL syntax) anything like \"(NEEDS%)\" because a few random courses say (NEEDS 6 HR BLOCK) or something along those lines\n",
+    "            while \"(NEEDS\" in line:\n",
+    "                tmp = line[line.index(\"(NEEDS\"):]\n",
+    "                line = line[0:line.index(\"(NEEDS\")] + line[line.index(\"(NEEDS\") + tmp.index(\")\") + 1:]\n",
+    "    \n",
+    "            # Clean up the line and remove department\n",
+    "            line = line.strip()\n",
+    "            line = line.split(\" \")\n",
+    "            line = [x for x in line if x != '']\n",
+    "            line.pop(0)\n",
+    "    \n",
+    "            # Look for the first number in the line - this will be the course code\n",
+    "            first_num = -1\n",
+    "            for i in range(len(line)):\n",
+    "                if any(char.isdigit() for char in line[i]):\n",
+    "                    first_num = i\n",
+    "                    break\n",
+    "            # Remove everything before the school code (ARCH, CSCI, etc)\n",
+    "            for i in range(first_num - 1):\n",
+    "                line.pop(0)\n",
+    "            # Get major\n",
+    "            major = line[0]\n",
+    "            line.pop(0)\n",
+    "            # Get the course codes\n",
+    "            course_string = line[0]\n",
+    "            courses = []\n",
+    "            # If there are multiple course codes, separate them out\n",
+    "            while \"/\" in course_string:\n",
+    "                i = course_string.index(\"/\")\n",
+    "                courses.append(course_string[0:i])\n",
+    "                course_string = course_string[i + 1:len(course_string)]\n",
+    "            courses.append(course_string)\n",
+    "            line.pop(0)\n",
+    "\n",
+    "            # Now the line is of the format:\n",
+    "            # [SECTIONS IN VARIOUS FORMATS, BUILDING, ROOM, DAY OF WEEK, MONTH, DAY OF MONTH, '@', START TIME, '-', END TIME, GRADES DUE]\n",
+    "\n",
+    "            # Start at the end of the line - this is because we don't know how many entries the SECTIONS will be in since doc is formatted inconsistently\n",
+    "\n",
+    "            # End time\n",
+    "            time2 = line[len(line) - 1]\n",
+    "            line.pop(len(line) - 1)\n",
+    "            line.pop(len(line) - 1)\n",
+    "\n",
+    "            # Start time\n",
+    "            time1 = line[len(line) - 1]\n",
+    "            line.pop(len(line) - 1)\n",
+    "            line.pop(len(line) - 1)\n",
+    "\n",
+    "            # Day of month\n",
+    "            day = line[len(line) - 1]\n",
+    "            line.pop(len(line) - 1)\n",
+    "\n",
+    "            # Month\n",
+    "            month = line[len(line) - 1]\n",
+    "            line.pop(len(line) - 1)\n",
+    "\n",
+    "            # Day of week\n",
+    "            weekday = line[len(line) - 1].replace(\",\", '')\n",
+    "            line.pop(len(line) - 1)\n",
+    "\n",
+    "            # Room\n",
+    "            room = line[len(line) - 1]\n",
+    "            line.pop(len(line) - 1)\n",
+    "\n",
+    "            # Building\n",
+    "            building = line[len(line) - 1]\n",
+    "            line.pop(len(line) - 1)\n",
+    "\n",
+    "\n",
+    "            # Split the major up if it is MATH/CSCI for example\n",
+    "            majors = []\n",
+    "            if '/' in major:\n",
+    "                while '/' in major:\n",
+    "                    index = major.index('/')\n",
+    "                    majors.append(major[:index])\n",
+    "                    major = major[index+1:]\n",
+    "            else:\n",
+    "                majors.append(major)\n",
+    "\n",
+    "            # Everything left is the sections\n",
+    "            # Get the sections from the remainder and fix some formatting (take out of parens and remove commas and ampersands)\n",
+    "            sections = [x.replace(\",\", \"\").replace(\"(\", \"\").replace(\")\", \"\") for x in line if x != ',' and x != '&']\n",
+    "            # If an entry is info for all sections of a class, write that and skip the rest\n",
+    "            all = False\n",
+    "            done = False\n",
+    "            for tmp_major in majors:\n",
+    "                for section in sections:\n",
+    "                    if \"ALL\" in section:\n",
+    "                        start_time, end_time = handle_times(time1, time2, day, month, year)\n",
+    "                        db_lines.append([season, year, tmp_major, course, None, start_time, end_time, building, room])\n",
+    "                        all = True\n",
+    "                if all:\n",
+    "                    done = True\n",
+    "                    continue\n",
+    "            if done:\n",
+    "                continue\n",
+    "            \n",
+    "            fixed_sections = []\n",
+    "            # Create separate section entries for all sections within a range ([01-05] becomes [01,02,03,04,05])\n",
+    "            for section in sections:\n",
+    "                if '-' in section:\n",
+    "                    num1 = int(section[:section.index(\"-\")])\n",
+    "                    num2 = int(section[section.index(\"-\") + 1:])\n",
+    "                    sections.remove(section)\n",
+    "                    for i in range(num1, num2 + 1):\n",
+    "                        fixed_sections.append(i)\n",
+    "                else:\n",
+    "                    fixed_sections.append(int(section))\n",
+    "            sections = fixed_sections\n",
+    "            # Adds all the entries into the array\n",
+    "            for tmp_major in majors:\n",
+    "                for section in sections:\n",
+    "                    for course in courses:\n",
+    "                        start_time, end_time = handle_times(time1, time2, day, month, year)\n",
+    "                        db_lines.append([season, year, tmp_major, course, int(section), start_time, end_time, building, room])\n",
+    "    return db_lines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "c100e052-1d8a-432c-8fc2-0086cfd5d334",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def display_and_write_csv(db_lines):\n",
+    "    # Place into pandas dataframe (not needed but useful for testing & makes writing to csv easier)\n",
+    "    df = pd.DataFrame(columns=('Season', 'Year', 'Major', 'Course', 'Section', 'Start', 'End', 'Building', 'Room_Number'))\n",
+    "    for i in range(len(db_lines)):\n",
+    "        df.loc[i] = db_lines[i]\n",
+    "    # standardize datetimes\n",
+    "    df['Start'] = pd.to_datetime(df['Start'])\n",
+    "    df['End'] = pd.to_datetime(df['End'])    \n",
+    "    if debug_mode:\n",
+    "        pd.set_option('display.max_rows', 500)\n",
+    "        display(df)\n",
+    "    # write to output csv\n",
+    "    df.to_csv('out.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "e54bdf1b-ba2d-42dc-afbf-39efdcbc02c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db_lines = parser()\n",
+    "display_and_write_csv(db_lines)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}