Merge pull request #1855 from He3lixxx/import-improvements

Importer improvements: Fix #1574
e-valuation · Feb 20, 2023 · 298c9b8 · 298c9b8
2 parents 2fd6957 + 5eb5086
commit 298c9b8
Show file tree

Hide file tree

Showing 10 changed files with 300 additions and 156 deletions.
diff --git a/evap/evaluation/forms.py b/evap/evaluation/forms.py
@@ -115,9 +115,6 @@ class Meta:
             "delegates": UserModelMultipleChoiceField,
         }
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
     def save(self, *args, **kw):
         super().save(*args, **kw)
         logger.info('User "%s" edited the settings.', self.instance.email)
diff --git a/evap/settings.py b/evap/settings.py
@@ -79,6 +79,9 @@
 # the importer will warn if any participant has more enrollments than this number
 IMPORTER_MAX_ENROLLMENTS = 7
 
+# Cutoff value passed to difflib.get_close_matches() to find typos in course names. Lower values are slower.
+IMPORTER_COURSE_NAME_SIMILARITY_WARNING_THRESHOLD = 0.9
+
 # the default descriptions for grade documents
 DEFAULT_FINAL_GRADES_DESCRIPTION_EN = "Final grades"
 DEFAULT_MIDTERM_GRADES_DESCRIPTION_EN = "Midterm grades"

diff --git a/evap/staff/fixtures/excel_files_test_data.py b/evap/staff/fixtures/excel_files_test_data.py
@@ -59,7 +59,7 @@
         ['Master', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Seminar', 'no', 'Kaufen', 'Buy', 'Dr.', 'Romano', 'Electram', '111@external.example.com'],
         ['Master', 'Synephebos', 'Diam', '', 'Seminar', 'no', 'Zerplatzen', 'Burst', 'Dr.', 'Sadipscing', 'Elitr', '234@external.example.com'],
         ['Diploma', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Vorlesung', 'yes', 'Schneiden', 'Cut', 'Dr.', 'Sic', 'Graecis', '890@external.example.com'],
-        ['Master', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Seminar', 'no', 'Kommen', 'Come', 'Prof. Dr.', 'Takimata', 'Labore', '678@internal.example.com'],
+        ['Bachelor', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Seminar', 'no', 'Kommen', 'Come', 'Prof. Dr.', 'Takimata', 'Labore', '678@internal.example.com'],
         ['Master', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Seminar', 'no', 'Kosten', 'Cost', 'Dr.', 'Aliquyam', 'Sanctus', ''],
         ['Master', 'Synephebos', 'Diam', 'diam.synephebos@institution.example.com', 'Praktikum', 'no', 'Wählen', 'Choose', 'Prof. Dr.', 'Dolor', 'Sit', 'asd@external.example.com'],
         ['Master', 'Lorem', 'Ipsum', 'ipsum.lorem@institution.example.com', 'Vorlesung', 'no', 'Schlagen', 'Beat', 'Prof. Dr.', 'Amet', 'Consetetur', '123@external.example.com'],
@@ -153,7 +153,7 @@
     'MA Belegungen': [
         ['Degree', 'Student last name', 'Student first name', 'Student email address', 'Course kind', 'Course is graded', 'Course name (de)', 'Course name (en)', 'Responsible title', 'Responsible last name', 'Responsible first name', 'Responsible email address'],
         ['Grandmaster', 'Quid', 'Bastius', 'bastius.quid@external.example.com', 'jaminar', 'probably not', 'Bauen', 'Build', '', 'Sed', 'Diam', '345@external.example.com'],
-        ['Beginner,Bachelor', 'Lorem', 'Ipsum', 'ipsum.lorem@institution.example.com', 'jaminar', 'probably not', 'Bauen', 'Build', '', 'Sed', 'Diam', '345@external.example.com'],
+        ['Beginner', 'Lorem', 'Ipsum', 'ipsum.lorem@institution.example.com', 'jaminar', 'probably not', 'Bauen', 'Build', '', 'Sed', 'Diam', '345@external.example.com'],
     ],
 }
 

diff --git a/evap/staff/importers/base.py b/evap/staff/importers/base.py
@@ -48,6 +48,8 @@ class Category(Enum):
             "too_many_enrollments", gettext_lazy("Unusually high number of enrollments"), 14
         )
 
+        SIMILAR_COURSE_NAMES = _CATEGORY_TUPLE("similar_course_names", gettext_lazy("Similar course names"), 15)
+
     level: Level
     category: Category
     message: str
@@ -136,7 +138,7 @@ class ConvertExceptionsToMessages:
     """Shared catch-all exception handlers between importers"""
 
     def __init__(self, importer_log: ImporterLog):
-        self.importer_log: ImporterLog = importer_log
+        self.importer_log = importer_log
 
     def __enter__(self):
         pass
@@ -274,6 +276,9 @@ def aggregated_keys_and_location_strings(self) -> Iterator[Tuple[Any, str]]:
 
             yield key, location_string
 
+    def keys(self) -> Iterable[Any]:
+        return self.first_location_by_key.keys()
+
 
 class Checker:
     def __init__(self, test_run: bool, importer_log: ImporterLog):

diff --git a/evap/staff/importers/enrollment.py b/evap/staff/importers/enrollment.py
@@ -1,3 +1,4 @@
+import difflib
 from collections import defaultdict
 from dataclasses import dataclass, fields
 from datetime import date, datetime
@@ -34,7 +35,7 @@
 )
 
 
-@dataclass
+@dataclass(frozen=True)
 class InvalidValue:
     # We make this a dataclass to make sure all instances compare equal.
     pass
@@ -83,9 +84,9 @@ def all_fields_valid(course_data: CourseData) -> TypeGuard[ValidCourseData]:
 
 
 class DegreeImportMapper:
-    class InvalidDegreeNamesException(Exception):
-        def __init__(self, *args, invalid_degree_names: Set[str], **kwargs):
-            self.invalid_degree_names = invalid_degree_names
+    class InvalidDegreeNameException(Exception):
+        def __init__(self, *args, invalid_degree_name: str, **kwargs):
+            self.invalid_degree_name = invalid_degree_name
             super().__init__(*args, **kwargs)
 
     def __init__(self) -> None:
@@ -95,20 +96,13 @@ def __init__(self) -> None:
             for import_name in degree.import_names
         }
 
-    def degree_set_from_import_string(self, import_string: str) -> Set[Degree]:
-        result = set()
-        invalid_degree_names = set()
-        for degree_name in import_string.split(","):
-            stripped_name = degree_name.strip()
-            try:
-                result.add(self.degrees[stripped_name.lower()])
-            except KeyError:
-                invalid_degree_names.add(stripped_name)
-
-        if invalid_degree_names:
-            raise self.InvalidDegreeNamesException(invalid_degree_names=invalid_degree_names)
-
-        return result
+    def degree_from_import_string(self, import_string: str) -> Degree:
+        trimmed_name = import_string.strip()
+        lookup_key = trimmed_name.lower()
+        try:
+            return self.degrees[lookup_key]
+        except KeyError as e:
+            raise self.InvalidDegreeNameException(invalid_degree_name=trimmed_name) from e
 
 
 class CourseTypeImportMapper:
@@ -160,7 +154,7 @@ class EnrollmentInputRow(InputRow):
     location: ExcelFileLocation
 
     # Cells in the order of appearance in a row of an import file
-    evaluation_degree_names: str
+    evaluation_degree_name: str
 
     student_last_name: str
     student_first_name: str
@@ -195,9 +189,9 @@ class EnrollmentInputRowMapper:
     def __init__(self, importer_log: ImporterLog):
         self.importer_log: ImporterLog = importer_log
 
-        self.course_type_mapper: CourseTypeImportMapper = CourseTypeImportMapper()
-        self.degree_mapper: DegreeImportMapper = DegreeImportMapper()
-        self.is_graded_mapper: IsGradedImportMapper = IsGradedImportMapper()
+        self.course_type_mapper = CourseTypeImportMapper()
+        self.degree_mapper = DegreeImportMapper()
+        self.is_graded_mapper = IsGradedImportMapper()
 
         self.invalid_degrees_tracker: Optional[FirstLocationAndCountTracker] = None
         self.invalid_course_types_tracker: Optional[FirstLocationAndCountTracker] = None
@@ -233,11 +227,10 @@ def _map_row(self, row: EnrollmentInputRow) -> EnrollmentParsedRow:
 
         degrees: MaybeInvalid[Set[Degree]]
         try:
-            degrees = self.degree_mapper.degree_set_from_import_string(row.evaluation_degree_names)
-        except DegreeImportMapper.InvalidDegreeNamesException as e:
+            degrees = {self.degree_mapper.degree_from_import_string(row.evaluation_degree_name)}
+        except DegreeImportMapper.InvalidDegreeNameException as e:
             degrees = invalid_value
-            for invalid_degree in e.invalid_degree_names:
-                self.invalid_degrees_tracker.add_location_for_key(row.location, invalid_degree)
+            self.invalid_degrees_tracker.add_location_for_key(row.location, e.invalid_degree_name)
 
         course_type: MaybeInvalid[CourseType]
         try:
@@ -390,13 +383,13 @@ def __init__(self, *args, semester: Semester, **kwargs):
         super().__init__(*args, **kwargs)
 
         self.course_merge_logic = CourseMergeLogic(semester)
-        self.course_merged_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
-        self.course_merge_impossible_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
-        self.name_de_collision_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
-        self.name_en_collision_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
+        self.course_merged_tracker = FirstLocationAndCountTracker()
+        self.course_merge_impossible_tracker = FirstLocationAndCountTracker()
+        self.name_de_collision_tracker = FirstLocationAndCountTracker()
+        self.name_en_collision_tracker = FirstLocationAndCountTracker()
 
         self.name_en_by_name_de: Dict[str, str] = {}
-        self.name_de_mismatch_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
+        self.name_de_mismatch_tracker = FirstLocationAndCountTracker()
 
     def check_course_data(self, course_data: CourseData, location: ExcelFileLocation) -> None:
         try:
@@ -479,6 +472,46 @@ def finalize(self) -> None:
             )
 
 
+class SimilarCourseNameChecker(Checker):
+    """
+    Searches for courses that have names with small edit distance and warns about them to make users aware of possible
+    typos.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.course_en_tracker = FirstLocationAndCountTracker()
+        self.course_de_tracker = FirstLocationAndCountTracker()
+
+    def check_course_data(self, course_data: CourseData, location: ExcelFileLocation) -> None:
+        self.course_en_tracker.add_location_for_key(location, course_data.name_en)
+        self.course_de_tracker.add_location_for_key(location, course_data.name_de)
+
+    def finalize(self) -> None:
+        warning_texts = []
+
+        for tracker in [self.course_en_tracker, self.course_de_tracker]:
+            for needle_name, location_string in tracker.aggregated_keys_and_location_strings():
+                matches = difflib.get_close_matches(
+                    needle_name,
+                    (name for name in tracker.keys() if name > needle_name),
+                    n=1,
+                    cutoff=settings.IMPORTER_COURSE_NAME_SIMILARITY_WARNING_THRESHOLD,
+                )
+                if matches:
+                    warning_texts.append(
+                        _('{location}: The course names "{name1}" and "{name2}" have a low edit distance.').format(
+                            location=location_string,
+                            name1=needle_name,
+                            name2=matches[0],
+                        )
+                    )
+
+        for warning_text in warning_texts:
+            self.importer_log.add_warning(warning_text, category=ImporterLogEntry.Category.SIMILAR_COURSE_NAMES)
+
+
 class ExistingParticipationChecker(Checker, RowCheckerMixin):
     """Warn if users are already stored as participants for a course in the database"""
 
@@ -528,7 +561,7 @@ def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
         self.course_data_by_name_en: Dict[str, CourseData] = {}
-        self.tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
+        self.tracker = FirstLocationAndCountTracker()
 
     def check_course_data(self, course_data: CourseData, location: ExcelFileLocation) -> None:
         if not all_fields_valid(course_data):
@@ -555,6 +588,40 @@ def finalize(self) -> None:
             )
 
 
+class UserDegreeMismatchChecker(Checker, RowCheckerMixin):
+    """Assert that a users degree is consistent between rows"""
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+        self.degree_by_email: Dict[str, Degree] = {}
+        self.tracker = FirstLocationAndCountTracker()
+
+    def check_row(self, row: EnrollmentParsedRow):
+        if row.student_data.email == "":
+            return
+
+        if isinstance(row.course_data.degrees, InvalidValue):
+            return
+
+        assert len(row.course_data.degrees) == 1, "Checker expected to have courses without merged degrees"
+        degree = next(iter(row.course_data.degrees))
+        stored_degree = self.degree_by_email.setdefault(row.student_data.email, degree)
+
+        if stored_degree != degree:
+            self.tracker.add_location_for_key(row.location, row.student_data.email)
+
+    def finalize(self) -> None:
+        for student_email, location_string in self.tracker.aggregated_keys_and_location_strings():
+            self.importer_log.add_error(
+                _('{location}: The degree of user "{email}" differs from their degree in a previous row.').format(
+                    location=location_string,
+                    email=student_email,
+                ),
+                category=ImporterLogEntry.Category.DEGREE,
+            )
+
+
 class TooManyEnrollmentsChecker(Checker, RowCheckerMixin):
     """Warn when users exceed settings.IMPORTER_MAX_ENROLLMENTS enrollments"""
 
@@ -623,8 +690,10 @@ def import_enrollments(
         parsed_rows = EnrollmentInputRowMapper(importer_log).map(input_rows)
         for checker in [
             TooManyEnrollmentsChecker(test_run, importer_log),
+            UserDegreeMismatchChecker(test_run, importer_log),
             CourseDataAdapter(CourseNameChecker(test_run, importer_log, semester=semester)),
             CourseDataAdapter(CourseDataMismatchChecker(test_run, importer_log)),
+            CourseDataAdapter(SimilarCourseNameChecker(test_run, importer_log)),
             UserDataAdapter(UserDataEmptyFieldsChecker(test_run, importer_log)),
             UserDataAdapter(UserDataMismatchChecker(test_run, importer_log)),
             UserDataAdapter(UserDataValidationChecker(test_run, importer_log)),
@@ -634,7 +703,7 @@ def import_enrollments(
 
         importer_log.raise_if_has_errors()
 
-        user_data_list, course_data_list = normalize_rows(parsed_rows, importer_log)
+        user_data_list, course_data_list = normalize_rows(parsed_rows)
         existing_user_profiles, new_user_profiles = get_user_profile_objects(user_data_list)
 
         responsible_emails = {course_data.responsible_email for course_data in course_data_list}
@@ -671,9 +740,7 @@ def import_enrollments(
     return importer_log
 
 
-def normalize_rows(
-    enrollment_rows: Iterable[EnrollmentParsedRow], importer_log: ImporterLog
-) -> Tuple[List[UserData], List[ValidCourseData]]:
+def normalize_rows(enrollment_rows: Iterable[EnrollmentParsedRow]) -> Tuple[List[UserData], List[ValidCourseData]]:
     """The row schema has denormalized students and evaluations. Normalize / merge them back together"""
     user_data_by_email: Dict[str, UserData] = {}
     course_data_by_name_en: Dict[str, ValidCourseData] = {}
@@ -689,17 +756,7 @@ def normalize_rows(
         course_data = course_data_by_name_en.setdefault(row.course_data.name_en, row.course_data)
         assert course_data.differing_fields(row.course_data) <= {"degrees"}
 
-        # Not a checker to keep merging and notifying about the merge together.
-        if not row.course_data.degrees.issubset(course_data.degrees):
-            course_data.degrees.update(row.course_data.degrees)
-
-            importer_log.add_warning(
-                _(
-                    '{location}: The degree of course "{course_name}" differs from its degrees in previous rows.'
-                    " All degrees have been set for the course."
-                ).format(location=row.location, course_name=course_data.name_en),
-                category=ImporterLogEntry.Category.DEGREE,
-            )
+        course_data.degrees.update(row.course_data.degrees)
 
     return list(user_data_by_email.values()), list(course_data_by_name_en.values())
 

diff --git a/evap/staff/importers/user.py b/evap/staff/importers/user.py
@@ -136,7 +136,7 @@ def __init__(self, *args, **kwargs) -> None:
         # maps user's mail to UserData instance where it was first seen to have O(1) lookup
         self.users: Dict[str, UserData] = {}
 
-        self.in_file_mismatch_tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
+        self.in_file_mismatch_tracker = FirstLocationAndCountTracker()
 
     def check_userdata(self, user_data: UserData, location: ExcelFileLocation):
         if user_data.email == "":
@@ -149,10 +149,10 @@ def check_userdata(self, user_data: UserData, location: ExcelFileLocation):
 
     def finalize(self) -> None:
         # Mismatches to older rows in the file
-        for user_email, location in self.in_file_mismatch_tracker.aggregated_keys_and_location_strings():
+        for email, location in self.in_file_mismatch_tracker.aggregated_keys_and_location_strings():
             self.importer_log.add_error(
-                _("{location}: The users's data (email: {user_email}) is different to a previous row.").format(
-                    location=location, user_email=user_email
+                _('{location}: The data of user "{email}" differs from their data in a previous row.').format(
+                    location=location, email=email
                 ),
                 category=ImporterLogEntry.Category.USER,
             )
@@ -282,7 +282,7 @@ def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.first_location_by_user_data: Dict[UserData, ExcelFileLocation] = {}
 
-        self.tracker: FirstLocationAndCountTracker = FirstLocationAndCountTracker()
+        self.tracker = FirstLocationAndCountTracker()
 
     def check_userdata(self, user_data: UserData, location: ExcelFileLocation):
         if user_data not in self.first_location_by_user_data: