Removed Sastadev dependency and made it optional (#18)
oktaal authored Nov 29, 2022
1 parent d73bad4 commit 55fe733
Showing 6 changed files with 45 additions and 34 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -18,10 +18,18 @@ AuChAnn was specifically developed to enhance linguistic data in the form of a t
## Getting Started

You can install AuChAnn using pip:

```bash
pip install auchann
```

You can also optionally install [Sastadev](https://github.com/UUDigitalHumanitieslab/sastadev),
which is used to detect inflection errors.

```bash
pip install auchann[NL]
```

When installed, the program can be run interactively from the console using the command `auchann`.

## Import as Library
@@ -55,6 +63,10 @@ settings.calc_distance = lambda original, correction: editdistance.distance(orig
# Default method detects inflection errors
settings.detect_error = lambda original, correction: (1, "m") if original == "geloopt" and correction == "liep" else (0, None)

### Sastadev contains a helper function for Dutch which detects inflection errors
from sastadev.deregularise import detect_error
settings.detect_error = detect_error

# How many words could be split from one?
# e.g. das -> da(t) (i)s requires a lookahead of 2
# hoest -> hoe (i)s (he)t requires a lookahead of 3
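Putting the pieces of this diff together, a custom `detect_error` can now stand in for Sastadev entirely. The sketch below follows the `align_words(transcript, correction, settings)` call used in the updated `unit-tests/test_align.py`; the example words and the toy detector are illustrative only, not part of the commit.

```python
# Minimal sketch: align a transcript with its correction using a custom
# inflection-error detector, so the optional sastadev package is not needed.
from auchann.align_words import AlignmentSettings, align_words

settings = AlignmentSettings()


def detect_error(original: str, correction: str):
    # Report a morphological error code ("m") for one known irregular verb pair;
    # report no error otherwise (a toy stand-in for Sastadev's helper).
    if original == "slaapte" and correction == "sliep":
        return 1, "m"
    return 0, None


settings.detect_error = detect_error

alignment = align_words("ik slaapte", "ik sliep", settings)
print(alignment)  # the CHAT-annotated line
```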
37 changes: 10 additions & 27 deletions auchann/align_words.py
@@ -2,24 +2,8 @@
from enum import Enum, unique
from auchann.correct_parenthesize import correct_parenthesize
import auchann.data as data
from sastadev.deregularise import correctinflection
import editdistance

chat_errors = {
    'Overgeneralisation': 'm',
    'Lacking ge prefix': 'm',
    'Prefix ge without onset': 'm',
    'Wrong Overgeneralisation': 'm',
    'Wrong -en suffix': 'm'
}


def map_error(error_type: str) -> str:
    try:
        return chat_errors[error_type]
    except KeyError:
        return error_type


@unique
class TokenOperation(Enum):
@@ -165,18 +149,8 @@ def __calc_distance(original: str, correction: str) -> int:

            return distance

        def __detect_error(original: str, correction: str) -> Tuple[int, Optional[str]]:
            error = None
            for candidate, candidate_error in correctinflection(original):
                if candidate == correction:
                    error = map_error(candidate_error)
            if error is not None:
                return 1, cast(str, error)
            else:
                return 0, None

        self.__calc_distance = __calc_distance
        self.__detect_error = __detect_error
        self.__detect_error = data.detect_error

    @property
    def calc_distance(self):
@@ -225,6 +199,15 @@ def detect_error(self):

    @detect_error.setter
    def detect_error(self, method: Callable[[str, str], Tuple[int, Optional[str]]]):
        """Specify a method that compares a text with a correction and returns
        the desired editing distance and the CHAT error code.
        When no error is returned, the returned editing distance will be
        ignored.
        Args:
            method: Callable[[str, str], Tuple[int, Optional[str]]]:
                (transcribed text, correction) -> (editing distance, error code)
        """
        self.__detect_error = method
        self.distance_hash = {}

8 changes: 8 additions & 0 deletions auchann/data.py
@@ -754,3 +754,11 @@
'papap',
'e>[/]',
'wawawa']


try:
    from sastadev.deregularise import detect_error  # type: ignore[import]

except ImportError:
    def detect_error(_: str, __: str):
        return 0, None
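With this fallback in place the import of `auchann.data.detect_error` always succeeds; a quick hypothetical check (not part of the commit, result assumes Sastadev is absent):

```python
# Without the optional sastadev package, auchann.data.detect_error is the
# no-op fallback defined above and never reports an inflection error.
from auchann.data import detect_error

print(detect_error("geloopt", "liep"))  # -> (0, None) when sastadev is absent
```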
2 changes: 0 additions & 2 deletions requirements.txt
@@ -12,5 +12,3 @@ pyyaml==5.4.1
    # via pyyaml-include
pyyaml-include==1.2.post2
    # via auchann (setup.py)
sastadev==0.0.2
    # via auchann (setup.py)
8 changes: 5 additions & 3 deletions setup.py
@@ -5,7 +5,7 @@

setup(
    name='auchann',
    version='0.1.1',
    version='0.2.0',
    packages=find_packages(include=['auchann', 'auchann.*']),
    package_data={'auchann': ['py.typed']},
    description='The AuChAnn (Automatic CHAT Annotation) package can generate CHAT annotations based on transcript-correction pairs of utterances.',
@@ -18,9 +18,11 @@
    install_requires=[
        'chamd>=0.5.8',
        'editdistance',
        'pyyaml-include',
        'sastadev'
        'pyyaml-include'
    ],
    extras_require={
        'NL': ['sastadev>=0.03']
    },
    python_requires='>=3.7',
    zip_safe=True,
    entry_points={
12 changes: 10 additions & 2 deletions unit-tests/test_align.py
@@ -1,4 +1,4 @@
from auchann.align_words import align_words, align_split
from auchann.align_words import AlignmentSettings, align_words, align_split


def test_replace():
@@ -71,6 +71,14 @@ def test_multi_word():


def assertAlign(transcript_line: str, correction_line: str, expected_chat_line: str):
    alignment = align_words(transcript_line, correction_line)
    settings = AlignmentSettings()
    def detect_error(original: str, correction: str):
        if original == "slaapte" and correction == "sliep":
            return 1, "m"
        else:
            return 0, None
    settings.detect_error = detect_error

    alignment = align_words(transcript_line, correction_line, settings)
    chat_line = str(alignment)
    assert chat_line == expected_chat_line
