-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Start of finals parser * Continued creating finals parser * Fixed some issues, began commenting * Fixed a few more issues, reconsidered logging * Improved formatting and fixed bug Fixed AM/PM bug, other changes * Converted to python script * Updated with instructions on how to convert to .py * Tested updated schedule * Added logic for cross listed courses * Added cross listing logic * Added requirements.txt * Fixed bug with \ entries * Updated the .py to match * remove idea folder * dont redefine format --------- Co-authored-by: dorian451 <112524240+dorian451@users.noreply.github.com>
- Loading branch information
Showing
14 changed files
with
2,069 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,6 @@ node_modules/ | |
.vscode | ||
.DS_Store | ||
.python-version | ||
*.ipynb* | ||
courses20.xml | ||
.coverage | ||
compose-dev.yaml | ||
|
293 changes: 293 additions & 0 deletions
293
rpi_data/Finals_Parser/.ipynb_checkpoints/FinalsParser-checkpoint.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,293 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "6b64fad0-14d1-4489-937b-857e5ecd8ec2", | ||
"metadata": {}, | ||
"source": [ | ||
"ASSUMPTIONS/DOCUMENTATION:\n", | ||
"\n", | ||
" - A null section value indicates that all sections share that exam date and time.\n", | ||
" - Some ARCH courses list section as \"80\". No clue what this means.\n", | ||
" - The process uses the FinalsBySubject.pdf document from the RPI website.\n", | ||
" - This pdf should have columns Department, Course, Location, Date, and Grades Due (although the first and last don't matter)\n", | ||
" - It should also be titled at the top of each page with Season Year followed by any amount of additional text (doesn't matter)\n", | ||
" - If the above are not true, small modifications must be made to the process\n", | ||
" - To handle inconsistent AM/PM labeling we assume that all exams begin at or after 8 AM and we assume all exams end at or before 10 PM\n", | ||
" - The current process assumes the finals document is named finals_schedule.pdf and is in the same folder as this process\n", | ||
" - The output is a csv file with format: ['Season', 'Year', 'Major', 'Course', 'Section', 'Start', 'End', 'Building', 'Room_Number']\n", | ||
" - Use \"jupyter nbconvert --to script FinalsParser.ipynb\" to convert the .ipynb file to a .py\n", | ||
"\n", | ||
"TODO:\n", | ||
"\n", | ||
" - Make grades due column not break the program - can't be fixed without a more filled-out version of the exam schedule\n", | ||
" - It is possible that this column is never filled in the publicly available version - meaning this isn't an issue." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 27, | ||
"id": "c5584e3c-8c4d-4656-8699-383826a60509", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from pypdf import PdfReader\n", | ||
"import os\n", | ||
"from datetime import datetime\n", | ||
"import pandas as pd\n", | ||
"import re\n", | ||
"import calendar\n", | ||
"debug_mode = False" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 28, | ||
"id": "b89afdbc-6199-4ad8-bc03-467ea9a495ce", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Construct a list to get the number of a month from its name\n", | ||
"months = list(calendar.month_name)\n", | ||
"months = [x.lower() for x in months]\n", | ||
"\n", | ||
"# Turns a time, day of month, month, and year into one datetime object for the table\n", | ||
"# Does this for both the start and end time for an exam\n", | ||
"# This is used to get the start and end times for an exam\n", | ||
"def handle_times(start_text, end_text, day, month, year):\n", | ||
" # Regex to get the hour and minute as separate values from a string of the format HH:MM AM or HH:MM PM\n", | ||
" start_nums = re.findall(r'\\d+', start_text)\n", | ||
" start_nums = [int(x) for x in start_nums]\n", | ||
" end_nums = re.findall(r'\\d+', end_text)\n", | ||
" end_nums = [int(x) for x in end_nums]\n", | ||
" # Instead of trying to track AM/PM we instead use the logic that exams only happen between 8AM - 9:30 PM and convert to military time\n", | ||
" # This is done because RPI likes to have typos such as 8:00 M instead of 8:00 PM making the AM/PM values unreliable\n", | ||
" if end_nums[0] <= 10:\n", | ||
" end_nums[0] += 12\n", | ||
" if start_nums[0] < 8:\n", | ||
" start_nums[0] += 12\n", | ||
" month_num = months.index(month.lower())\n", | ||
" # Construct and return the datetime object\n", | ||
" start_text = year + str(month_num) + day + str(start_nums[0]) + \":\" + str(start_nums[1])\n", | ||
" end_text = year + str(month_num) + day + str(end_nums[0]) + \":\" + str(end_nums[1])\n", | ||
" format = '%Y%m%d%H:%M'\n", | ||
" start_time = datetime.strptime(start_text, format)\n", | ||
" end_time = datetime.strptime(end_text, format)\n", | ||
" return start_time, end_time" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 29, | ||
"id": "201b53c9-4435-4371-8463-cc06ec5a19dc", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def parser():\n", | ||
" files = [f for f in os.listdir('.') if os.path.isfile(f)]\n", | ||
" \n", | ||
" reader = PdfReader(\"finals_schedule.pdf\")\n", | ||
" number_of_pages = len(reader.pages)\n", | ||
" \n", | ||
" db_lines = []\n", | ||
"\n", | ||
" # Process the document page by page\n", | ||
" for page in reader.pages:\n", | ||
" text = page.extract_text(extraction_mode=\"layout\")\n", | ||
" # Process the text to handle the following cases:\n", | ||
" # - If the location is \"TBA\" we replace this with \"TBA TBA\" since location is Room RoomNumber\n", | ||
" # - If the location is \"ONLINE\" we replace this with \"ONLINE NA\" for the same reason (Room=ONLINE,RoomNumber=NA) \n", | ||
" # - If there are \"/\" in the text we remove surrounding spaces so they don't cause issues (multiple different issues)\n", | ||
" # - Remove the word \"SECTIONS\" as it's superfluous and inconsistent\n", | ||
" text = text.replace(\" / \", \"/\").replace(\"TBA\", \"TBA TBA\").replace(\"ONLINE\", \"ONLINE NA\").replace(\"(ALL \", \"(ALL\").replace(\"SECTIONS \", \"\")\n", | ||
" # Split text into lines, get the season (Fall,Summer,Spring) and year (20__) from the first line, then remove the first three lines since they are header/blank\n", | ||
" text = text.split('\\n')\n", | ||
" for_year = text[0].split(\" \")\n", | ||
" for_year = [x for x in for_year if x != '']\n", | ||
" season = for_year[0]\n", | ||
" year = for_year[1]\n", | ||
" text.pop(0)\n", | ||
" text.pop(0)\n", | ||
" text.pop(0)\n", | ||
" # Remove a fourth line for the first page only since it has the column headers\n", | ||
" if \"DEPARTMENT\" in text[0] and \"COURSE\" in text[0]:\n", | ||
" text.pop(0)\n", | ||
" \n", | ||
" # Now, parse the lines\n", | ||
" for line in text:\n", | ||
" \n", | ||
" # Remove (in SQL syntax) anything like \"(NEEDS%)\" because a few random courses say (NEEDS 6 HR BLOCK) or something along those lines\n", | ||
" while \"(NEEDS\" in line:\n", | ||
" tmp = line[line.index(\"(NEEDS\"):]\n", | ||
" line = line[0:line.index(\"(NEEDS\")] + line[line.index(\"(NEEDS\") + tmp.index(\")\") + 1:]\n", | ||
" \n", | ||
" # Clean up the line and remove department\n", | ||
" line = line.strip()\n", | ||
" line = line.split(\" \")\n", | ||
" line = [x for x in line if x != '']\n", | ||
" line.pop(0)\n", | ||
" \n", | ||
" # Look for the first number in the line - this will be the course code\n", | ||
" first_num = -1\n", | ||
" for i in range(len(line)):\n", | ||
" if any(char.isdigit() for char in line[i]):\n", | ||
" first_num = i\n", | ||
" break\n", | ||
" # Remove everything before the school code (ARCH, CSCI, etc)\n", | ||
" for i in range(first_num - 1):\n", | ||
" line.pop(0)\n", | ||
" # Get major\n", | ||
" major = line[0]\n", | ||
" line.pop(0)\n", | ||
" # Get the course codes\n", | ||
" course_string = line[0]\n", | ||
" courses = []\n", | ||
" # If there are multiple course codes, separate them out\n", | ||
" while \"/\" in course_string:\n", | ||
" i = course_string.index(\"/\")\n", | ||
" courses.append(course_string[0:i])\n", | ||
" course_string = course_string[i + 1:len(course_string)]\n", | ||
" courses.append(course_string)\n", | ||
" line.pop(0)\n", | ||
"\n", | ||
" # Now the line is of the format:\n", | ||
" # [SECTIONS IN VARIOUS FORMATS, BUILDING, ROOM, DAY OF WEEK, MONTH, DAY OF MONTH, '@', START TIME, '-', END TIME, GRADES DUE]\n", | ||
"\n", | ||
" # Start at the end of the line - this is because we don't know how many entries the SECTIONS will be in since doc is formatted inconsistently\n", | ||
"\n", | ||
" # End time\n", | ||
" time2 = line[len(line) - 1]\n", | ||
" line.pop(len(line) - 1)\n", | ||
" line.pop(len(line) - 1)\n", | ||
"\n", | ||
" # Start time\n", | ||
" time1 = line[len(line) - 1]\n", | ||
" line.pop(len(line) - 1)\n", | ||
" line.pop(len(line) - 1)\n", | ||
"\n", | ||
" # Day of month\n", | ||
" day = line[len(line) - 1]\n", | ||
" line.pop(len(line) - 1)\n", | ||
"\n", | ||
" # Month\n", | ||
" month = line[len(line) - 1]\n", | ||
" line.pop(len(line) - 1)\n", | ||
"\n", | ||
" # Day of week\n", | ||
" weekday = line[len(line) - 1].replace(\",\", '')\n", | ||
" line.pop(len(line) - 1)\n", | ||
"\n", | ||
" # Room\n", | ||
" room = line[len(line) - 1]\n", | ||
" line.pop(len(line) - 1)\n", | ||
"\n", | ||
" # Building\n", | ||
" building = line[len(line) - 1]\n", | ||
" line.pop(len(line) - 1)\n", | ||
"\n", | ||
"\n", | ||
" # Split the major up if it is MATH/CSCI for example\n", | ||
" majors = []\n", | ||
" if '/' in major:\n", | ||
" while '/' in major:\n", | ||
" index = major.index('/')\n", | ||
" majors.append(major[:index])\n", | ||
" major = major[index+1:]\n", | ||
" else:\n", | ||
" majors.append(major)\n", | ||
"\n", | ||
" # Everything left is the sections\n", | ||
" # Get the sections from the remainder and fix some formatting (take out of parens and remove commas and ampersands)\n", | ||
" sections = [x.replace(\",\", \"\").replace(\"(\", \"\").replace(\")\", \"\") for x in line if x != ',' and x != '&']\n", | ||
" # If an entry is info for all sections of a class, write that and skip the rest\n", | ||
" all = False\n", | ||
" done = False\n", | ||
" for tmp_major in majors:\n", | ||
" for section in sections:\n", | ||
" if \"ALL\" in section:\n", | ||
" start_time, end_time = handle_times(time1, time2, day, month, year)\n", | ||
" db_lines.append([season, year, tmp_major, course, None, start_time, end_time, building, room])\n", | ||
" all = True\n", | ||
" if all:\n", | ||
" done = True\n", | ||
" continue\n", | ||
" if done:\n", | ||
" continue\n", | ||
" \n", | ||
" fixed_sections = []\n", | ||
" # Create separate section entries for all sections within a range ([01-05] becomes [01,02,03,04,05])\n", | ||
" for section in sections:\n", | ||
" if '-' in section:\n", | ||
" num1 = int(section[:section.index(\"-\")])\n", | ||
" num2 = int(section[section.index(\"-\") + 1:])\n", | ||
" sections.remove(section)\n", | ||
" for i in range(num1, num2 + 1):\n", | ||
" fixed_sections.append(i)\n", | ||
" else:\n", | ||
" fixed_sections.append(int(section))\n", | ||
" sections = fixed_sections\n", | ||
" # Adds all the entries into the array\n", | ||
" for tmp_major in majors:\n", | ||
" for section in sections:\n", | ||
" for course in courses:\n", | ||
" start_time, end_time = handle_times(time1, time2, day, month, year)\n", | ||
" db_lines.append([season, year, tmp_major, course, int(section), start_time, end_time, building, room])\n", | ||
" return db_lines" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 30, | ||
"id": "c100e052-1d8a-432c-8fc2-0086cfd5d334", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def display_and_write_csv(db_lines):\n", | ||
" # Place into pandas DataFrame (not needed but useful for testing & makes writing to csv easier)\n", | ||
" df = pd.DataFrame(columns=('Season', 'Year', 'Major', 'Course', 'Section', 'Start', 'End', 'Building', 'Room_Number'))\n", | ||
" for i in range(len(db_lines)):\n", | ||
" df.loc[i] = db_lines[i]\n", | ||
" # standardize datetimes\n", | ||
" df['Start'] = pd.to_datetime(df['Start'])\n", | ||
" df['End'] = pd.to_datetime(df['End']) \n", | ||
" if debug_mode:\n", | ||
" pd.set_option('display.max_rows', 500)\n", | ||
" display(df)\n", | ||
" # write to output csv\n", | ||
" df.to_csv('out.csv')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 31, | ||
"id": "e54bdf1b-ba2d-42dc-afbf-39efdcbc02c0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"db_lines = parser()\n", | ||
"display_and_write_csv(db_lines)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.