Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use sophisticated regex to detect limited enrollment #143

Merged
merged 10 commits into from
Feb 4, 2025
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ beautifulsoup4==4.11.1
lxml==4.9.3
requests==2.31.0
tomli>=2.0.1; python_version < "3.11"
nltk>=3.6.5 # skip yanked version; see https://pypi.org/project/nltk/#history
49 changes: 48 additions & 1 deletion scrapers/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,32 @@
"limited": true | false,
}
}

Functions:
* is_not_offered_this_year(html)
* is_not_offered_next_year(html)
* is_repeat_allowed(html)
* get_url(html)
* has_final(html)
* get_half(html)
* is_limited(html)
* get_course_data(filtered_html)
* get_home_catalog_links()
* get_all_catalog_links(initial_hrefs)
* get_anchors_with_classname(element)
* scrape_courses_from_page(courses, href)
* run()

Constants:
* BASE_URL
* LIMITED_REGEX

Dependencies:
* json
* os.path
* re
* requests
* bs4
"""

import json
Expand All @@ -24,6 +50,27 @@

BASE_URL = "http://student.mit.edu/catalog"

# various limited/restricted/etc enrollment phrases in course descriptions
# PLEASE use regex101.com to test changes before pushing to production!!!
# text_mining.py also helps by finding test sentences from our entire database

LIMITED_REGEX = re.compile(
r"""(?x)
[Ee]nrollment[ ](|is[ ]|may[ ]be[ ]|will[ ]be[ ])
(limited|restricted|by[ ]application)
|([Ll]imited|[Rr]estricted)[ ]
(enrollment|by[ ]lottery|number|\d+|to[ ]\d+)
|([Ll]imited|[Rr]estricted|([Pp]reference|[Pp]riority)( given| is given)?)
[ ]to[ ]([A-Za-z0-9-' ]+)?
(
students?|freshmen|sophomores|juniors|seniors|majors|minors
|concentrators|[Ff]ellows|MBAs?|undergraduates|candidates
)
|required[ ]prior[ ]to[ ]enrollment
|have[ ]priority
"""
)


def is_not_offered_this_year(html):
"""
Expand Down Expand Up @@ -120,7 +167,7 @@ def is_limited(html):
Returns:
* bool: True if enrollment in the class is limited
"""
if html.find(text=re.compile("[Ll]imited")):
if html.find(text=LIMITED_REGEX):
return True
return False

Expand Down
92 changes: 92 additions & 0 deletions scrapers/text_mining.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""
Mines hydrant data

Functions:
* has_keyword(sometext)
* find_key_sentences(sometext)
* get_description_list(dataset)
* get_my_data()
* find_matching_records(descriptions)
* run()

Constants:
* KEYWORDS
* FOLDER
* FILEPATHS
"""

import json
from nltk.tokenize import word_tokenize, sent_tokenize

# Lowercased words that signal some form of enrollment restriction;
# has_keyword() compares them against lowercased tokens.
KEYWORDS = ["limited", "restricted", "enrollment", "preference", "priority"]
# Location of the scraped term datasets, relative to the working directory.
FOLDER = "../public/"
# One JSON dataset per term (fall/spring/IAP); each has a "classes" mapping.
FILEPATHS = ["f22.json", "f23.json", "f24.json", "i25.json", "s23.json", "s24.json"]


def has_keyword(sometext):
    """
    True if sometext contains a keyword, False otherwise

    Tokenizes with nltk's word_tokenize (better than the `in` operator,
    which would match substrings) and compares case-insensitively.
    """
    tokens = {token.lower() for token in word_tokenize(sometext)}
    return any(keyword in tokens for keyword in KEYWORDS)


def find_key_sentences(sometext):
    """
    Returns a list of all sentences that contain a keyword

    Splits with nltk's sent_tokenize, which is much better than .split().
    """
    return [sentence for sentence in sent_tokenize(sometext) if has_keyword(sentence)]


def get_description_list(dataset):
    """
    Obtains a list of descriptions from the dataset

    Expects dataset["classes"] to map class numbers to records that each
    carry a "description" field; returns the descriptions in mapping order.
    """
    descriptions = []
    for record in dataset["classes"].values():
        descriptions.append(record["description"])
    return descriptions


def get_my_data():
    """
    obtains the data

    Loads every term dataset listed in FILEPATHS (resolved against FOLDER,
    so the working directory must be the scrapers/ folder) and concatenates
    their class descriptions into one list.
    """
    all_descriptions = []
    for name in FILEPATHS:
        with open(FOLDER + name, "r", encoding="utf-8") as handle:
            dataset = json.load(handle)
        all_descriptions += get_description_list(dataset)
    return all_descriptions


def find_matching_records(descriptions):
    """
    find sentences from record descriptions that contain a keyword

    Returns the matching sentences deduplicated and sorted, so output is
    stable across runs.
    """
    matches = set()
    for text in descriptions:
        matches.update(find_key_sentences(text))
    return sorted(matches)


def run():
    """
    The main function!

    Prints every keyword-bearing sentence found in the datasets, one per
    line, for manual review of LIMITED_REGEX candidates.
    """
    for sentence in find_matching_records(get_my_data()):
        print(sentence)


# Allow running this module directly as a script.
if __name__ == "__main__":
    run()