Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use sophisticated regex to detect limited enrollment #143

Merged
merged 10 commits into from
Feb 4, 2025
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ beautifulsoup4==4.11.1
lxml==4.9.3
requests==2.31.0
tomli>=2.0.1; python_version < "3.11"
nltk>=3.6.5 # skip yanked version; see https://pypi.org/project/nltk/#history
49 changes: 48 additions & 1 deletion scrapers/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,32 @@
"limited": true | false,
}
}

Functions:
* is_not_offered_this_year(html)
* is_not_offered_next_year(html)
* is_repeat_allowed(html)
* get_url(html)
* has_final(html)
* get_half(html)
* is_limited(html)
* get_course_data(filtered_html)
* get_home_catalog_links()
* get_all_catalog_links(initial_hrefs)
* get_anchors_with_classname(element)
* scrape_courses_from_page(courses, href)
* run()

Constants:
* BASE_URL
* LIMITED_REGEX

Dependencies:
* json
* os.path
* re
* requests
* bs4
"""

import json
Expand All @@ -24,6 +50,27 @@

BASE_URL = "http://student.mit.edu/catalog"

# various limited/restricted/etc enrollment phrases in course descriptions
# PLEASE use regex101.com to test changes before pushing to production!!!
# text_mining.py also helps by finding test sentences from our entire database

LIMITED_REGEX = re.compile(
r"""(?x)
[Ee]nrollment[ ](|is[ ]|may[ ]be[ ]|will[ ]be[ ])
(limited|restricted|by[ ]application)
|([Ll]imited|[Rr]estricted)[ ]
(enrollment|by[ ]lottery|number|\d+|to[ ]\d+)
|([Ll]imited|[Rr]estricted|([Pp]reference|[Pp]riority)( given| is given)?)
[ ]to[ ]([A-Za-z0-9-' ]+)?
(
students?|freshmen|sophomores|juniors|seniors|majors|minors
|concentrators|[Ff]ellows|MBAs?|undergraduates|candidates
)
|required[ ]prior[ ]to[ ]enrollment
|have[ ]priority
"""
)


def is_not_offered_this_year(html):
"""
Expand Down Expand Up @@ -120,7 +167,7 @@ def is_limited(html):
Returns:
* bool: True if enrollment in the class is limited
"""
if html.find(text=re.compile("[Ll]imited")):
if html.find(text=LIMITED_REGEX):
return True
return False

Expand Down
92 changes: 92 additions & 0 deletions scrapers/text_mining.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""
Mines hydrant data

Functions:
* has_keyword(sometext)
* find_key_sentences(sometext)
* get_description_list(dataset)
* get_my_data()
* find_matching_records(descriptions)
* run()

Constants:
* KEYWORDS
* FOLDER
* FILEPATHS
"""

import json
from nltk.tokenize import word_tokenize, sent_tokenize

# Lowercased words that signal some form of enrollment restriction;
# has_keyword() compares them against lowercased tokens.
KEYWORDS = ["limited", "restricted", "enrollment", "preference", "priority"]
# Location of the scraped term datasets, relative to the working directory.
FOLDER = "../public/"
# One JSON dataset per term (fall/spring/IAP); each has a "classes" mapping.
FILEPATHS = ["f22.json", "f23.json", "f24.json", "i25.json", "s23.json", "s24.json"]


def has_keyword(sometext):
    """
    True if sometext contains a keyword, False otherwise

    Tokenizes with nltk's word_tokenize (better than the `in` operator,
    which would match substrings) and compares case-insensitively.
    """
    tokens = {token.lower() for token in word_tokenize(sometext)}
    return any(keyword in tokens for keyword in KEYWORDS)


def find_key_sentences(sometext):
    """
    Returns a list of all sentences that contain a keyword

    Splits with nltk's sent_tokenize, which is much better than .split().
    """
    return [sentence for sentence in sent_tokenize(sometext) if has_keyword(sentence)]


def get_description_list(dataset):
    """
    Obtains a list of descriptions from the dataset

    Expects dataset["classes"] to map class numbers to records that each
    carry a "description" field; returns the descriptions in mapping order.
    """
    descriptions = []
    for record in dataset["classes"].values():
        descriptions.append(record["description"])
    return descriptions


def get_my_data():
    """
    obtains the data

    Loads every term dataset listed in FILEPATHS (resolved against FOLDER,
    so the working directory must be the scrapers/ folder) and concatenates
    their class descriptions into one list.
    """
    all_descriptions = []
    for name in FILEPATHS:
        with open(FOLDER + name, "r", encoding="utf-8") as handle:
            dataset = json.load(handle)
        all_descriptions += get_description_list(dataset)
    return all_descriptions


def find_matching_records(descriptions):
    """
    find sentences from record descriptions that contain a keyword

    Returns the matching sentences deduplicated and sorted, so output is
    stable across runs.
    """
    matches = set()
    for text in descriptions:
        matches.update(find_key_sentences(text))
    return sorted(matches)


def run():
    """
    The main function!

    Prints every keyword-bearing sentence found in the datasets, one per
    line, for manual review of LIMITED_REGEX candidates.
    """
    for sentence in find_matching_records(get_my_data()):
        print(sentence)


# Allow running this module directly as a script.
if __name__ == "__main__":
    run()