Split Dictionary files and load them only as needed (#2484)

* Add script to fetch dict files and convert to wordlists
* Add script for dictionary splitting
* Add split .dic.js files
* Add a dictionaryLoader class to handle the lazy loading of the dictionary parts
* Use NFKD unicode normalization to get consistent splitting and lookup
sillsdev · Aug 25, 2023 · 136bb89 · 136bb89
1 parent 594b5b6
commit 136bb89
Show file tree

Hide file tree

Showing 213 changed files with 1,078,412 additions and 288,208 deletions.
diff --git a/.gitignore b/.gitignore
@@ -68,6 +68,11 @@ __pycache__
 *.pyc
 venv
 
+# Intermediate files for dictionary import scripts
+src/resources/dictionaries/*.aff
+src/resources/dictionaries/*.dic
+src/resources/dictionaries/*.txt
+
 # Intermediate and output files for Semantic Domain import scripts
 !deploy/scripts/semantic_domains/xml/*.xml
 deploy/scripts/semantic_domains/json/*.json

diff --git a/.prettierignore b/.prettierignore
@@ -1,5 +1,5 @@
 # These files are very large and slow to format.
-src/resources/dictionaries
+*.dic.js
 docs/user_guide/site
 
 # Prettier cannot handle Helm Templates.

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -31,6 +31,13 @@
     "editor.defaultFormatter": "esbenp.prettier-vscode"
   },
   "files.insertFinalNewline": true,
+  "cSpell.ignorePaths": [
+    "*.dic.js",
+    "node_modules",
+    "package-lock.json",
+    "public/locales/",
+    "!public/locales/en/"
+  ],
   "cSpell.words": [
     "Adposition",
     "axios",
@@ -67,7 +74,9 @@
     "textfile",
     "thecombine",
     "upsert",
-    "venv"
+    "venv",
+    "wordlist",
+    "wordlists"
   ],
   "git.branchProtection": ["master", "main"]
 }
diff --git a/README.md b/README.md
@@ -54,12 +54,13 @@ A rapid word collection tool. See the [User Guide](https://sillsdev.github.io/Th
    4. [Import Semantic Domains](#import-semantic-domains)
    5. [Generate License Reports](#generate-license-reports)
    6. [Inspect Database](#inspect-database)
-   7. [Cleanup Local Repository](#cleanup-local-repository)
+   7. [Add or Update Dictionary Files](#add-or-update-dictionary-files)
+   8. [Cleanup Local Repository](#cleanup-local-repository)
 3. [Setup Local Kubernetes Cluster](#setup-local-kubernetes-cluster)
    1. [Install Rancher Desktop](#install-rancher-desktop)
    2. [Install Docker Desktop](#install-docker-desktop)
    3. [Install Kubernetes Tools](#install-kubernetes-tools)
-4. [Setup The Combine](#setup-the-combine)
+4. [Setup _The Combine_](#setup-the-combine)
    1. [Install Required Charts](#install-required-charts)
    2. [Build _The Combine_ Containers](#build-the-combine-containers)
    3. [Setup Environment Variables](#setup-environment-variables)
@@ -449,6 +450,43 @@ To browse the database locally during development, open MongoDB Compass Communit
 1. Under New Connection, enter `mongodb://localhost:27017`
 2. Under Databases, select CombineDatabase
 
+### Add or Update Dictionary Files
+
+The dictionary files for spell-check functionality in _The Combine_ are split into parts to allow lazy-loading, for the
+sake of devices with limited bandwidth. The scripts described below generate these files in
+`src/resources/dictionaries/`; files in this directory should _not_ be manually edited.
+
+The bash script `scripts/fetch_wordlist.sh` is used to fetch dictionary files for a given language (e.g., `es`) from
+https://cgit.freedesktop.org/libreoffice/dictionaries/ and convert them to raw wordlists (e.g.,
+`src/resources/dictionaries/es.txt`). Execute the script with no arguments for its usage details. Any language not
+currently supported can be manually added as a case in this script.
+
+```bash
+./scripts/fetch_wordlist.sh
+```
+
+The python script `scripts/split_dictionary.py` takes a wordlist textfile (e.g., `src/resources/dictionaries/es.txt`),
+splits it into multiple TypeScript files (e.g., into `src/resources/dictionaries/es/` with index file
+`.../es/index.ts`), and updates `src/resources/dictionaries/index.ts` accordingly. Run the script within a Python
+virtual environment, with `-h`/`--help` to see its usage details.
+
+```bash
+python scripts/split_dictionary.py --help
+```
+
+For some languages, the wordlist is too large for practical use. Generally try to keep the folder for each language
+under 2.5 MB, to avoid such errors as
+`FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory` in the Kubernetes build. For smaller
+folder sizes, default maximum word-lengths are automatically imposed for some languages (`ar`, `es`, `fr`, `pt`, `ru`).
+Use `-m`/`--max` to override the defaults, with `-m -1` to force no limit.
+
+Adjust the `-t`/`--threshold` and `-T`/`--Threshold` parameters to split a wordlist into more, smaller files; e.g.:
+
+- `python scripts/split_dictionary.py -l hi -t 1000`
+- `python scripts/split_dictionary.py -l sw -t 1500`
+
+The top of each language's `index.ts` file states which values of `-m`, `-t`, and `-T` were used for that language.
+
 ### Cleanup Local Repository
 
 It's sometimes possible for a developer's local temporary state to get out of sync with other developers or CI. This
@@ -545,7 +583,7 @@ links:
 2. [helm](https://helm.sh/docs/intro/install/)
    - On Windows, if using [Chocolatey][chocolatey]: `choco install kubernetes-helm`
 
-## Setup The Combine
+## Setup _The Combine_
 
 This section describes how to build and deploy _The Combine_ to your Kubernetes cluster. Unless specified otherwise, all
 of the commands below are run from _The Combine's_ project directory and are run in an activated Python virtual

diff --git a/lgtm.yml b/lgtm.yml
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -33,7 +33,8 @@
     "test-frontend": "         react-scripts               test",
     "test-frontend:coverage": "react-scripts               test --coverage --watchAll=false",
     "test-frontend:debug": "   react-scripts --inspect-brk test --runInBand --no-cache",
-    "test:ci": "dotnet test Backend.Tests/Backend.Tests.csproj && CI=true react-scripts test --ci --all --testResultsProcessor jest-teamcity-reporter"
+    "test:ci": "dotnet test Backend.Tests/Backend.Tests.csproj && CI=true react-scripts test --ci --all --testResultsProcessor jest-teamcity-reporter",
+    "wordlist": "hunspell-reader words"
   },
   "dependencies": {
     "@emotion/react": "^11.11.0",
@@ -55,6 +56,7 @@
     "distinct-colors": "^3.0.0",
     "history": "^5.3.0",
     "http-status-codes": "^2.1.4",
+    "hunspell-reader": "^7.0.0",
     "i18next": "^23.4.4",
     "i18next-browser-languagedetector": "^7.1.0",
     "i18next-http-backend": "^2.2.1",
@@ -133,6 +135,9 @@
       "browser": true,
       "jest": true
     },
+    "ignorePatterns": [
+      "*.dic.js"
+    ],
     "rules": {
       "@typescript-eslint/no-empty-interface": "warn",
       "@typescript-eslint/no-inferrable-types": "warn",

diff --git a/scripts/fetch_wordlist.sh b/scripts/fetch_wordlist.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+
+######################################################
+# Script to fetch dictionary files from
+# cgit.freedesktop.org/libreoffice/dictionaries/tree/
+# and convert them into wordlists
+######################################################
+
+set -e
+
+usage() {
+  cat <<USAGE
+  Usage: $0 [options]
+    Fetch dictionary files for specified language and convert to a wordlist
+  Options:
+    -h, --help:
+          print this message
+    -l, --lang:
+          (required) language to generate wordlist for
+          options: ar, en, es, fr, hi, pt, ru, sw
+    -d, --dry-run:
+          print commands instead of executing them
+    -v, --verbose:
+          print each line of code before it is executed
+  Caveats:
+    This script assumes:
+      * internet access
+      * scripts.wordlist defined in package.json
+      * hunspell-reader installed with npm
+    If you run this script many times in rapid succession,
+      your dictionary download may be throttled by the source
+USAGE
+}
+
+# With no arguments at all, show the help text rather than failing.
+if [[ $# -eq 0 ]] ; then
+  usage
+  exit 0
+fi
+
+#######################################
+# Run a command, or only print it when --dry-run was requested.
+# Globals:   DRYRUN (read)
+# Arguments: the command followed by its arguments, one word each
+# Outputs:   in dry-run mode, the command line on stdout
+# Returns:   the command's exit status (0 in dry-run mode)
+#######################################
+run_cmd() {
+  if [[ ${DRYRUN} -eq 1 ]] ; then
+    echo "$*"
+  else
+    # "$@" keeps every argument as a single word, so paths and URLs are never
+    # re-split or glob-expanded the way an unquoted command string would be.
+    "$@"
+  fi
+}
+
+DRYRUN=0
+# Deliberately not named LANG: that is the standard locale environment
+# variable, and assigning a bare language code to it would change the locale
+# seen by this shell and by the curl/npm child processes.
+LANG_CODE=
+while [[ $# -gt 0 ]] ; do
+  arg="$1"
+  shift
+
+  case ${arg} in
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    -l|--lang)
+      # Without this check, the `shift` below would abort the script silently
+      # (set -e) when -l/--lang is the last argument.
+      if [[ $# -eq 0 ]] ; then
+        echo "The -l/--lang argument requires a value" >&2
+        exit 1
+      fi
+      LANG_CODE=$1
+      shift
+      if [[ "${LANG_CODE}" =~ [^a-z] ]]; then
+        echo "The -l/--lang argument must be lowercase alphabetic" >&2
+        exit 1
+      fi
+      ;;
+    -d|--dry-run)
+      DRYRUN=1
+      ;;
+    -v|--verbose)
+      set -x
+      ;;
+    *)
+      echo "Unrecognized argument: ${arg}" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Map the language code to the base URL (without extension) of its
+# LibreOffice .aff/.dic pair.
+SRC=https://cgit.freedesktop.org/libreoffice/dictionaries/plain/
+case ${LANG_CODE} in
+  ar)
+    # URL=${SRC}ar/ar
+    echo "The Arabic LibreOffice dictionary generates a 1.5 GB wordlist." >&2
+    echo "Manually download the wordlist from https://sourceforge.net/projects/arabic-wordlist/ instead" >&2
+    echo "and save it to src/resources/dictionaries/ar.txt for use with the split_dictionary.py script." >&2
+    exit 1
+    ;;
+  en)
+    URL=${SRC}en/en_US
+    ;;
+  es)
+    URL=${SRC}es/es
+    ;;
+  fr)
+    URL=${SRC}fr_FR/fr
+    ;;
+  hi)
+    URL=${SRC}hi_IN/hi_IN
+    ;;
+  pt)
+    URL=${SRC}pt_BR/pt_BR
+    ;;
+  ru)
+    URL=${SRC}ru_RU/ru_RU
+    ;;
+  sw)
+    URL=${SRC}sw_TZ/sw_TZ
+    ;;
+  *)
+    echo "Unavailable language: ${LANG_CODE}" >&2
+    echo "Options: ar, en, es, fr, hi, pt, ru, sw" >&2
+    exit 1
+    ;;
+esac
+
+DIR=src/resources/dictionaries/
+AFF=${DIR}${LANG_CODE}.aff
+DIC=${DIR}${LANG_CODE}.dic
+TXT=${DIR}${LANG_CODE}.txt
+
+# --fail makes curl exit non-zero on an HTTP error instead of saving the
+# server's error page as if it were the dictionary file.
+echo "** Fetching .aff file **"
+run_cmd curl --fail -o "${AFF}" "${URL}.aff"
+
+echo "** Fetching .dic file **"
+run_cmd curl --fail -o "${DIC}" "${URL}.dic"
+
+echo "** Converting to .txt wordlist **"
+run_cmd npm run wordlist -- "${DIC}" -po "${TXT}"
+echo "** Wordlist saved to ${TXT} **"
+
+# The .aff/.dic files are only intermediate inputs for the conversion.
+run_cmd rm "${AFF}"
+run_cmd rm "${DIC}"
+echo "** Deleted .aff and .dic files **"