kurianbenoy · kurianbenoy · Feb 10, 2024 · Feb 9, 2024 · Feb 9, 2024 · Feb 9, 2024
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ pip install git+https://github.com/kurianbenoy/whisper_normalizer.git
 
 ## Why should we normalize/standardize text?
 
-- In ASR systems it’s important to normalize the text to reduce error in
+- In ASR systems it’s important to normalize the text to reduce unintentional penalties in
   metrics like WER, CER etc.
 - Text normalization/standardization is process of converting texts in
   different styles into a standardized form, which is a best-effort
@@ -69,9 +69,9 @@ english_normalizer("I'm a little teapot, short and stout. Tip me over and pour m
 
     'i am a little teapot short and stout tip me over and pour me out'
 
-This model extends Whisper_normalizer to support indic languages as
-well. The logic for normalization in Indic languages is derived from
-[indic-nlp-library](https://github.com/anoopkunchukuttan/indic_nlp_library).
+### This model extends Whisper_normalizer to support Indic languages as well
+
+The logic for normalization in Indic languages is derived from [indic-nlp-library](https://github.com/anoopkunchukuttan/indic_nlp_library). The logic for Malayalam normalization is expanded beyond the Indic NLP library.
 
 ``` python
 from whisper_normalizer.indic_normalizer import MalayalamNormalizer
@@ -80,4 +80,23 @@ normalizer = MalayalamNormalizer()
 normalizer("എന്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.")
 ```
 
-    'എന്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.'
+    'എന്റെ കമ്പ്യൂട്ടറിന് എന്റെ ഭാഷ.'
+
+### Malayalam Normalization unit test
+
+``` bash
+python ml_test.py
+```
+
+    Passes: 6
+    Failures: 2
+    Input: എൻറെ
+    Expected Output: എന്റെ
+    Actual Output: എൻറെ
+
+    Input: കാൺമാനില്ല
+    Expected Output: കാണ്മാനില്ല
+    Actual Output: കാൺമാനില്ല
+    ----------------------------------------------------------------------
+    Ran 1 test in 0.000s
+    OK
diff --git a/ml_norm.tsv b/ml_norm.tsv
@@ -0,0 +1,8 @@
+ദു:ഖം	ദുഃഖം
+നമ:	നമഃ
+പാല്‍	പാൽ
+പാൽ	പാൽ
+എൻറെ	എന്റെ
+ഹെൻറി	ഹെൻറി
+എന്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.	എന്റെ കമ്പ്യൂട്ടറിന് എന്റെ ഭാഷ.
+കാൺമാനില്ല	കാണ്മാനില്ല
diff --git a/ml_test.py b/ml_test.py
@@ -0,0 +1,47 @@
+import unittest
+import os
+from whisper_normalizer.indic_normalizer import MalayalamNormalizer
+
+normalizer = MalayalamNormalizer()  # single instance shared by the test case
+
+CURR_DIR = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))  # this script's directory
+
+
+import unittest
+
+class TestNormalizer(unittest.TestCase):
+    def setUp(self):
+        # Read input and output data from ml_norm.tsv
+        self.test_data = []
+        with open(os.path.join(CURR_DIR, "ml_norm.tsv")) as file:
+            for line in file:
+                input_data, expected_output = line.strip().split('\t')
+                self.test_data.append((input_data, expected_output))
+
+    def test_normalizer(self):
+        passes = 0
+        failures = 0
+        failure_list = []
+
+        for input_data, expected_output in self.test_data:
+            # Apply your normalizer to the input data
+            result = normalizer(input_data)
+
+            # Compare the result with the expected output
+            if result == expected_output:
+                passes += 1
+            else:
+                failures += 1
+                failure_list.append((input_data, expected_output, result))
+
+        # Print the list of failures and the number of passes
+        print("Passes:", passes)
+        print("Failures:", failures)
+        for failure in failure_list:
+            print("Input:", failure[0])
+            print("Expected Output:", failure[1])
+            print("Actual Output:", failure[2])
+            print()  # Add a newline for clarity
+
+if __name__ == '__main__':
+    unittest.main()  # run the test case when executed as a script
diff --git a/whisper_normalizer/indic_normalizer.py b/whisper_normalizer/indic_normalizer.py
@@ -1058,4 +1058,7 @@ def __call__(self, text: str):
         # correct visarga
         text = re.sub(r"([\u0d00-\u0d7f]):", "\\1\u0d03", text)
 
+        # remove samvruthokaram: drop the vowel sign U (U+0D41) preceding a virama (U+0D4D)
+        text = text.replace("\u0d41\u0d4d", "\u0d4d")
+
         return text