No login parse (#885)

* Wrote functions to find all course codes and to generate links based off those codes. * Made a lot of progress but forgot to commit. * end of session commit * Wrote some formatting for the scraped data. * First draft of no login scraper works (mostly) * Started work on a script to scrape Professor Goldschmidt's course information files. * Saving the work for today. Started work on a script to scrape professor data to our json format. * Got all of the professor links. * Committing my work from Friday. Finished scraping all professor data, just have to output as json. * Finished up the faculty scrape, finished up Goldschmidt parse. * Commit to work off on desktop * Changed file locations and started catalog parser. * Pulling over some work I did on HASSPathways over to YACS, with a lot of large edits and additions to make it work with my previous code and to maximize performance. * Commented my scraper functions. * oops.
YACS-RCOS · Nov 5, 2024 · 5153051 · 5153051
1 parent ffaac6f
commit 5153051
Show file tree

Hide file tree

Showing 28 changed files with 3,796 additions and 1,758 deletions.
diff --git a/rpi_data/Professors.json b/rpi_data/Professors.json
diff --git a/rpi_data/fall-2024.csv b/rpi_data/fall-2024.csv
diff --git a/rpi_data/modules/ci_scraper.py b/rpi_data/modules/ci_scraper.py
@@ -0,0 +1,54 @@
+from bs4 import BeautifulSoup as bs
+import requests
+from pypdf import PdfReader
+import os
+
+'''
+Scrapes a Communication Intensive PDF as of August 2024
+
+by Giancarlo Martinelli (discord: gcm)
+'''
+
+def is_number(s: str) -> bool:
+    """Return True if ``s`` can be parsed as an integer by ``int()``.
+
+    Anything ``int()`` accepts counts as a number, including signed values
+    such as "-5" and strings with surrounding whitespace.
+
+    Args:
+        s: The text to test.
+
+    Returns:
+        True when ``int(s)`` succeeds, False when it raises ``ValueError``
+        (text is not an integer) or ``TypeError`` (value cannot be converted).
+    """
+    try:
+        int(s)
+    except (TypeError, ValueError):
+        # Only conversion failures mean "not a number"; anything else
+        # (e.g. KeyboardInterrupt) should propagate instead of being hidden.
+        return False
+    return True
+
+def parse_page(page_text: str) -> list[str]:
+    """Collect the course codes listed on one page of extracted PDF text.
+
+    A line is treated as a course row when its first whitespace-separated
+    token is exactly five ASCII digits (presumably the section's CRN --
+    confirm against the PDF layout). The second token is then taken as the
+    course identifier and everything after its last "-" is dropped, so
+    "CSCI-1200-01" becomes "CSCI-1200".
+
+    Args:
+        page_text: Text of a single page, with rows separated by newlines.
+
+    Returns:
+        The course codes in the order they appear; duplicates are kept.
+    """
+    result = []
+    for line in page_text.split("\n"):
+        # split() with no argument collapses runs of whitespace, so doubled
+        # spaces or leading indentation cannot yield an empty "word".
+        words = line.split()
+        # A row needs both the number and the course token; a line holding
+        # only a 5-digit value would otherwise raise IndexError below.
+        if len(words) < 2:
+            continue
+        number, course = words[0], words[1]
+        # Checking the digits directly rejects values such as "-1234" or
+        # "1_234", which int() would accept as numbers.
+        if len(number) == 5 and number.isascii() and number.isdigit():
+            result.append(course.rsplit("-", 1)[0])
+    return result
+
+def parse_pdf(pdf_path: str) -> set[str]:
+    """Read a PDF and gather the course codes found on all of its pages.
+
+    Args:
+        pdf_path: Path of the PDF file to open with ``PdfReader``.
+
+    Returns:
+        The set of unique course codes ``parse_page`` found across all pages.
+    """
+    pdf = PdfReader(pdf_path)
+    cis = set()
+    for page in pdf.pages:
+        # update() adds every code from this page; the set drops repeats.
+        cis.update(parse_page(page.extract_text()))
+    return cis
+
+# Manual test entry point: parse pdfs/fall2024-ci.pdf (resolved relative to
+# this file's directory) and show what was scraped.
+if __name__ == "__main__":
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    pdf_path = os.path.join(dir_path, 'pdfs', 'fall2024-ci.pdf')
+    # Print the codes (sorted for stable output) so a manual run is visible;
+    # the result was previously computed and discarded.
+    for code in sorted(parse_pdf(pdf_path)):
+        print(code)