Skip to content

Commit

Permalink
Merge pull request #1855 from He3lixxx/import-improvements
Browse files Browse the repository at this point in the history
Importer improvements: Fix #1574
  • Loading branch information
richardebeling authored Feb 20, 2023
2 parents 2fd6957 + 5eb5086 commit 298c9b8
Show file tree
Hide file tree
Showing 10 changed files with 300 additions and 156 deletions.
3 changes: 0 additions & 3 deletions evap/evaluation/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,6 @@ class Meta:
"delegates": UserModelMultipleChoiceField,
}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def save(self, *args, **kw):
super().save(*args, **kw)
logger.info('User "%s" edited the settings.', self.instance.email)
3 changes: 3 additions & 0 deletions evap/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@
# the importer will warn if any participant has more enrollments than this number
IMPORTER_MAX_ENROLLMENTS = 7

# Cutoff value passed to difflib.get_close_matches() to find typos in course names. Lower values are slower.
IMPORTER_COURSE_NAME_SIMILARITY_WARNING_THRESHOLD = 0.9

# the default descriptions for grade documents
DEFAULT_FINAL_GRADES_DESCRIPTION_EN = "Final grades"
DEFAULT_MIDTERM_GRADES_DESCRIPTION_EN = "Midterm grades"
Expand Down
4 changes: 2 additions & 2 deletions evap/staff/fixtures/excel_files_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
['Master', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Seminar', 'no', 'Kaufen', 'Buy', 'Dr.', 'Romano', 'Electram', '111@external.example.com'],
['Master', 'Synephebos', 'Diam', '', 'Seminar', 'no', 'Zerplatzen', 'Burst', 'Dr.', 'Sadipscing', 'Elitr', '234@external.example.com'],
['Diploma', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Vorlesung', 'yes', 'Schneiden', 'Cut', 'Dr.', 'Sic', 'Graecis', '890@external.example.com'],
['Master', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Seminar', 'no', 'Kommen', 'Come', 'Prof. Dr.', 'Takimata', 'Labore', '678@internal.example.com'],
['Bachelor', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Seminar', 'no', 'Kommen', 'Come', 'Prof. Dr.', 'Takimata', 'Labore', '678@internal.example.com'],
['Master', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Seminar', 'no', 'Kosten', 'Cost', 'Dr.', 'Aliquyam', 'Sanctus', ''],
['Master', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Praktikum', 'no', 'Wählen', 'Choose', 'Prof. Dr.', 'Dolor', 'Sit', 'asd@external.example.com'],
['Master', 'Lorem', 'Ipsum', 'ipsum.lorem@institution.example.com', 'Vorlesung', 'no', 'Schlagen', 'Beat', 'Prof. Dr.', 'Amet', 'Consetetur', '123@external.example.com'],
Expand Down Expand Up @@ -153,7 +153,7 @@
'MA Belegungen': [
['Degree', 'Student last name', 'Student first name', 'Student email address', 'Course kind', 'Course is graded', 'Course name (de)', 'Course name (en)', 'Responsible title', 'Responsible last name', 'Responsible first name', 'Responsible email address'],
['Grandmaster', 'Quid', 'Bastius', 'bastius.quid@external.example.com', 'jaminar', 'probably not', 'Bauen', 'Build', '', 'Sed', 'Diam', '345@external.example.com'],
['Beginner,Bachelor', 'Lorem', 'Ipsum', 'ipsum.lorem@institution.example.com', 'jaminar', 'probably not', 'Bauen', 'Build', '', 'Sed', 'Diam', '345@external.example.com'],
['Beginner', 'Lorem', 'Ipsum', 'ipsum.lorem@institution.example.com', 'jaminar', 'probably not', 'Bauen', 'Build', '', 'Sed', 'Diam', '345@external.example.com'],
],
}

Expand Down
7 changes: 6 additions & 1 deletion evap/staff/importers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ class Category(Enum):
"too_many_enrollments", gettext_lazy("Unusually high number of enrollments"), 14
)

SIMILAR_COURSE_NAMES = _CATEGORY_TUPLE("similar_course_names", gettext_lazy("Similar course names"), 15)

level: Level
category: Category
message: str
Expand Down Expand Up @@ -136,7 +138,7 @@ class ConvertExceptionsToMessages:
"""Shared catch-all exception handlers between importers"""

def __init__(self, importer_log: ImporterLog):
self.importer_log: ImporterLog = importer_log
self.importer_log = importer_log

def __enter__(self):
pass
Expand Down Expand Up @@ -274,6 +276,9 @@ def aggregated_keys_and_location_strings(self) -> Iterator[Tuple[Any, str]]:

yield key, location_string

def keys(self) -> Iterable[Any]:
return self.first_location_by_key.keys()


class Checker:
def __init__(self, test_run: bool, importer_log: ImporterLog):
Expand Down
151 changes: 104 additions & 47 deletions evap/staff/importers/enrollment.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import difflib
from collections import defaultdict
from dataclasses import dataclass, fields
from datetime import date, datetime
Expand Down Expand Up @@ -34,7 +35,7 @@
)


@dataclass
@dataclass(frozen=True)
class InvalidValue:
# We make this a dataclass to make sure all instances compare equal.
pass
Expand Down Expand Up @@ -83,9 +84,9 @@ def all_fields_valid(course_data: CourseData) -> TypeGuard[ValidCourseData]:


class DegreeImportMapper:
class InvalidDegreeNamesException(Exception):
def __init__(self, *args, invalid_degree_names: Set[str], **kwargs):
self.invalid_degree_names = invalid_degree_names
class InvalidDegreeNameException(Exception):
def __init__(self, *args, invalid_degree_name: str, **kwargs):
self.invalid_degree_name = invalid_degree_name
super().__init__(*args, **kwargs)

def __init__(self) -> None:
Expand All @@ -95,20 +96,13 @@ def __init__(self) -> None:
for import_name in degree.import_names
}

def degree_set_from_import_string(self, import_string: str) -> Set[Degree]:
result = set()
invalid_degree_names = set()
for degree_name in import_string.split(","):
stripped_name = degree_name.strip()
try:
result.add(self.degrees[stripped_name.lower()])
except KeyError:
invalid_degree_names.add(stripped_name)

if invalid_degree_names:
raise self.InvalidDegreeNamesException(invalid_degree_names=invalid_degree_names)

return result
def degree_from_import_string(self, import_string: str) -> Degree:
trimmed_name = import_string.strip()
lookup_key = trimmed_name.lower()
try:
return self.degrees[lookup_key]
except KeyError as e:
raise self.InvalidDegreeNameException(invalid_degree_name=trimmed_name) from e


class CourseTypeImportMapper:
Expand Down Expand Up @@ -160,7 +154,7 @@ class EnrollmentInputRow(InputRow):
location: ExcelFileLocation

# Cells in the order of appearance in a row of an import file
evaluation_degree_names: str
evaluation_degree_name: str

student_last_name: str
student_first_name: str
Expand Down Expand Up @@ -195,9 +189,9 @@ class EnrollmentInputRowMapper:
def __init__(self, importer_log: ImporterLog):
self.importer_log: ImporterLog = importer_log

self.course_type_mapper: CourseTypeImportMapper = CourseTypeImportMapper()
self.degree_mapper: DegreeImportMapper = DegreeImportMapper()
self.is_graded_mapper: IsGradedImportMapper = IsGradedImportMapper()
self.course_type_mapper = CourseTypeImportMapper()
self.degree_mapper = DegreeImportMapper()
self.is_graded_mapper = IsGradedImportMapper()

self.invalid_degrees_tracker: Optional[FirstLocationAndCountTracker] = None
self.invalid_course_types_tracker: Optional[FirstLocationAndCountTracker] = None
Expand Down Expand Up @@ -233,11 +227,10 @@ def _map_row(self, row: EnrollmentInputRow) -> EnrollmentParsedRow:

degrees: MaybeInvalid[Set[Degree]]
try:
degrees = self.degree_mapper.degree_set_from_import_string(row.evaluation_degree_names)
except DegreeImportMapper.InvalidDegreeNamesException as e:
degrees = {self.degree_mapper.degree_from_import_string(row.evaluation_degree_name)}
except DegreeImportMapper.InvalidDegreeNameException as e:
degrees = invalid_value
for invalid_degree in e.invalid_degree_names:
self.invalid_degrees_tracker.add_location_for_key(row.location, invalid_degree)
self.invalid_degrees_tracker.add_location_for_key(row.location, e.invalid_degree_name)

course_type: MaybeInvalid[CourseType]
try:
Expand Down Expand Up @@ -390,13 +383,13 @@ def __init__(self, *args, semester: Semester, **kwargs):
super().__init__(*args, **kwargs)

self.course_merge_logic = CourseMergeLogic(semester)
self.course_merged_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
self.course_merge_impossible_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
self.name_de_collision_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
self.name_en_collision_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
self.course_merged_tracker = FirstLocationAndCountTracker()
self.course_merge_impossible_tracker = FirstLocationAndCountTracker()
self.name_de_collision_tracker = FirstLocationAndCountTracker()
self.name_en_collision_tracker = FirstLocationAndCountTracker()

self.name_en_by_name_de: Dict[str, str] = {}
self.name_de_mismatch_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
self.name_de_mismatch_tracker = FirstLocationAndCountTracker()

def check_course_data(self, course_data: CourseData, location: ExcelFileLocation) -> None:
try:
Expand Down Expand Up @@ -479,6 +472,46 @@ def finalize(self) -> None:
)


class SimilarCourseNameChecker(Checker):
"""
Searches for courses that have names with small edit distance and warns about them to make users aware of possible
typos.
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.course_en_tracker = FirstLocationAndCountTracker()
self.course_de_tracker = FirstLocationAndCountTracker()

def check_course_data(self, course_data: CourseData, location: ExcelFileLocation) -> None:
self.course_en_tracker.add_location_for_key(location, course_data.name_en)
self.course_de_tracker.add_location_for_key(location, course_data.name_de)

def finalize(self) -> None:
warning_texts = []

for tracker in [self.course_en_tracker, self.course_de_tracker]:
for needle_name, location_string in tracker.aggregated_keys_and_location_strings():
matches = difflib.get_close_matches(
needle_name,
(name for name in tracker.keys() if name > needle_name),
n=1,
cutoff=settings.IMPORTER_COURSE_NAME_SIMILARITY_WARNING_THRESHOLD,
)
if matches:
warning_texts.append(
_('{location}: The course names "{name1}" and "{name2}" have a low edit distance.').format(
location=location_string,
name1=needle_name,
name2=matches[0],
)
)

for warning_text in warning_texts:
self.importer_log.add_warning(warning_text, category=ImporterLogEntry.Category.SIMILAR_COURSE_NAMES)


class ExistingParticipationChecker(Checker, RowCheckerMixin):
"""Warn if users are already stored as participants for a course in the database"""

Expand Down Expand Up @@ -528,7 +561,7 @@ def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)

self.course_data_by_name_en: Dict[str, CourseData] = {}
self.tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
self.tracker = FirstLocationAndCountTracker()

def check_course_data(self, course_data: CourseData, location: ExcelFileLocation) -> None:
if not all_fields_valid(course_data):
Expand All @@ -555,6 +588,40 @@ def finalize(self) -> None:
)


class UserDegreeMismatchChecker(Checker, RowCheckerMixin):
"""Assert that a users degree is consistent between rows"""

def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)

self.degree_by_email: Dict[str, Degree] = {}
self.tracker = FirstLocationAndCountTracker()

def check_row(self, row: EnrollmentParsedRow):
if row.student_data.email == "":
return

if isinstance(row.course_data.degrees, InvalidValue):
return

assert len(row.course_data.degrees) == 1, "Checker expected to have courses without merged degrees"
degree = next(iter(row.course_data.degrees))
stored_degree = self.degree_by_email.setdefault(row.student_data.email, degree)

if stored_degree != degree:
self.tracker.add_location_for_key(row.location, row.student_data.email)

def finalize(self) -> None:
for student_email, location_string in self.tracker.aggregated_keys_and_location_strings():
self.importer_log.add_error(
_('{location}: The degree of user "{email}" differs from their degree in a previous row.').format(
location=location_string,
email=student_email,
),
category=ImporterLogEntry.Category.DEGREE,
)


class TooManyEnrollmentsChecker(Checker, RowCheckerMixin):
"""Warn when users exceed settings.IMPORTER_MAX_ENROLLMENTS enrollments"""

Expand Down Expand Up @@ -623,8 +690,10 @@ def import_enrollments(
parsed_rows = EnrollmentInputRowMapper(importer_log).map(input_rows)
for checker in [
TooManyEnrollmentsChecker(test_run, importer_log),
UserDegreeMismatchChecker(test_run, importer_log),
CourseDataAdapter(CourseNameChecker(test_run, importer_log, semester=semester)),
CourseDataAdapter(CourseDataMismatchChecker(test_run, importer_log)),
CourseDataAdapter(SimilarCourseNameChecker(test_run, importer_log)),
UserDataAdapter(UserDataEmptyFieldsChecker(test_run, importer_log)),
UserDataAdapter(UserDataMismatchChecker(test_run, importer_log)),
UserDataAdapter(UserDataValidationChecker(test_run, importer_log)),
Expand All @@ -634,7 +703,7 @@ def import_enrollments(

importer_log.raise_if_has_errors()

user_data_list, course_data_list = normalize_rows(parsed_rows, importer_log)
user_data_list, course_data_list = normalize_rows(parsed_rows)
existing_user_profiles, new_user_profiles = get_user_profile_objects(user_data_list)

responsible_emails = {course_data.responsible_email for course_data in course_data_list}
Expand Down Expand Up @@ -671,9 +740,7 @@ def import_enrollments(
return importer_log


def normalize_rows(
enrollment_rows: Iterable[EnrollmentParsedRow], importer_log: ImporterLog
) -> Tuple[List[UserData], List[ValidCourseData]]:
def normalize_rows(enrollment_rows: Iterable[EnrollmentParsedRow]) -> Tuple[List[UserData], List[ValidCourseData]]:
"""The row schema has denormalized students and evaluations. Normalize / merge them back together"""
user_data_by_email: Dict[str, UserData] = {}
course_data_by_name_en: Dict[str, ValidCourseData] = {}
Expand All @@ -689,17 +756,7 @@ def normalize_rows(
course_data = course_data_by_name_en.setdefault(row.course_data.name_en, row.course_data)
assert course_data.differing_fields(row.course_data) <= {"degrees"}

# Not a checker to keep merging and notifying about the merge together.
if not row.course_data.degrees.issubset(course_data.degrees):
course_data.degrees.update(row.course_data.degrees)

importer_log.add_warning(
_(
'{location}: The degree of course "{course_name}" differs from its degrees in previous rows.'
" All degrees have been set for the course."
).format(location=row.location, course_name=course_data.name_en),
category=ImporterLogEntry.Category.DEGREE,
)
course_data.degrees.update(row.course_data.degrees)

return list(user_data_by_email.values()), list(course_data_by_name_en.values())

Expand Down
10 changes: 5 additions & 5 deletions evap/staff/importers/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def __init__(self, *args, **kwargs) -> None:
# maps user's mail to UserData instance where it was first seen to have O(1) lookup
self.users: Dict[str, UserData] = {}

self.in_file_mismatch_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
self.in_file_mismatch_tracker = FirstLocationAndCountTracker()

def check_userdata(self, user_data: UserData, location: ExcelFileLocation):
if user_data.email == "":
Expand All @@ -149,10 +149,10 @@ def check_userdata(self, user_data: UserData, location: ExcelFileLocation):

def finalize(self) -> None:
# Mismatches to older rows in the file
for user_email, location in self.in_file_mismatch_tracker.aggregated_keys_and_location_strings():
for email, location in self.in_file_mismatch_tracker.aggregated_keys_and_location_strings():
self.importer_log.add_error(
_("{location}: The users's data (email: {user_email}) is different to a previous row.").format(
location=location, user_email=user_email
_('{location}: The data of user "{email}" differs from their data in a previous row.').format(
location=location, email=email
),
category=ImporterLogEntry.Category.USER,
)
Expand Down Expand Up @@ -282,7 +282,7 @@ def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.first_location_by_user_data: Dict[UserData, ExcelFileLocation] = {}

self.tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
self.tracker = FirstLocationAndCountTracker()

def check_userdata(self, user_data: UserData, location: ExcelFileLocation):
if user_data not in self.first_location_by_user_data:
Expand Down
Loading

0 comments on commit 298c9b8

Please sign in to comment.