Add basque dictionary #146

Merged
merged 5 commits into from
Apr 26, 2023
3 changes: 2 additions & 1 deletion README.rst
@@ -33,7 +33,7 @@ list. Those words that are found more often in the frequency list are
**more likely** the correct results.

``pyspellchecker`` supports multiple languages including English, Spanish,
German, French, and Portuguese. For information on how the dictionaries were
German, French, Portuguese, Arabic, and Basque. For information on how the dictionaries were
created and how they can be updated and improved, please see the
**Dictionary Creation and Updating** section of the readme!

@@ -147,6 +147,7 @@ The currently supported dictionaries are:
* German - 'de'
* Russian - 'ru'
* Arabic - 'ar'
* Basque - 'eu'
* Latvian - 'lv'

Dictionary Creation and Updating
60 changes: 59 additions & 1 deletion scripts/build_dictionary.py
Expand Up @@ -12,6 +12,7 @@
Portuguese Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.pt.gz
Russian Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ru.gz
Arabic Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ar.gz
Basque Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.eu.gz
Latvian Input: https://huggingface.co/datasets/RaivisDejus/latvian-text
Requirements:
The script requires more than the standard library to run in its
@@ -647,6 +648,61 @@ def clean_arabic(word_frequency, filepath_exclude, filepath_include):

return word_frequency

def clean_basque(word_frequency, filepath_exclude, filepath_include):
    """Clean a Basque word frequency list

    Args:
        word_frequency (Counter): word frequency counts to clean
        filepath_exclude (str): path to the file of flagged misspellings to remove
        filepath_include (str): path to the file of known missing words to add back in
    """
    letters = set("abcdefghijklmnopqrstuvwxyzñ")

    # fix issues with words containing other characters
    invalid_chars = list()
    for key in word_frequency:
        kl = set(key)
        if kl.issubset(letters):
            continue
        invalid_chars.append(key)
    for misfit in invalid_chars:
        word_frequency.pop(misfit)

    # remove words that start with a double a ("aa")
    double_a = list()
    for key in word_frequency:
        if key.startswith("aa"):
            double_a.append(key)
    for misfit in double_a:
        word_frequency.pop(misfit)

    # TODO: other possible fixes?

    # remove words with a frequency at or below the minimum threshold
    small_frequency = list()
    for key in word_frequency:
        if word_frequency[key] <= MINIMUM_FREQUENCY:
            small_frequency.append(key)
    for misfit in small_frequency:
        word_frequency.pop(misfit)

    # remove flagged misspellings
    with load_file(filepath_exclude) as fobj:
        for line in fobj:
            line = line.strip()
            if line in word_frequency:
                word_frequency.pop(line)

    # Add known missing words back in (ugh)
    with load_file(filepath_include) as fobj:
        for line in fobj:
            line = line.strip()
            if line in word_frequency:
                print("{} is already found in the dictionary! Skipping!".format(line))
            else:
                word_frequency[line] = MINIMUM_FREQUENCY

    return word_frequency
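The character-filter and minimum-frequency passes above follow the same pattern as the other `clean_*` helpers: collect the offending keys first, then pop them, so the Counter is never mutated while being iterated. A self-contained sketch of those two steps (the function name and threshold value here are illustrative, not the script's actual definitions):

```python
from collections import Counter

MINIMUM_FREQUENCY = 10  # illustrative threshold; the real script defines its own


def filter_words(word_frequency, letters, min_freq):
    """Drop words with out-of-alphabet characters or low counts, in place."""
    # build the removal list first to avoid mutating during iteration
    misfits = [
        word for word in word_frequency
        if not set(word).issubset(letters) or word_frequency[word] <= min_freq
    ]
    for word in misfits:
        word_frequency.pop(word)
    return word_frequency


letters = set("abcdefghijklmnopqrstuvwxyzñ")
wf = Counter({"etxea": 120, "kaixo2": 50, "bai": 4})
filter_words(wf, letters, MINIMUM_FREQUENCY)
# "kaixo2" is dropped (contains a digit); "bai" is dropped (count <= threshold)
```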

def clean_latvian(word_frequency, filepath_exclude, filepath_include):
"""Clean a Latvian word frequency list
@@ -741,7 +797,7 @@ def _parse_args():
        description="Build a new dictionary (word frequency) using the OpenSubtitles2018 project"
    )
    parser.add_argument(
        "-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru", "ar", "lv"]
        "-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru", "ar", "lv", "eu"]
    )
    parser.add_argument(
        "-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
@@ -817,6 +873,8 @@ def _parse_args():
        word_frequency = clean_russian(word_frequency, exclude_filepath, include_filepath)
    elif args.language == "ar":
        word_frequency = clean_arabic(word_frequency, exclude_filepath, include_filepath)
    elif args.language == "eu":
        word_frequency = clean_basque(word_frequency, exclude_filepath, include_filepath)
    elif args.language == "lv":
        word_frequency = clean_latvian(word_frequency, exclude_filepath, include_filepath)

Empty file added scripts/data/eu_exclude.txt
Empty file.
Empty file added scripts/data/eu_include.txt
Empty file.
Binary file added spellchecker/resources/eu.json.gz
Binary file not shown.