Skip to content

Commit

Permalink
feat(stemmer): adds english stemmer (#109)
Browse files Browse the repository at this point in the history
* feat(stemmer): adds english stemmer
* feat(tokenizer): adds english stop-words list
* feat(tokenizer): adds ability to disable/enable stemming and stop-words
* test(tokenizer): adds tests for custom tokenizer configs
  • Loading branch information
micheleriva authored Aug 29, 2022
1 parent 6094fcc commit 9f5995d
Show file tree
Hide file tree
Showing 13 changed files with 780 additions and 100 deletions.
12 changes: 12 additions & 0 deletions src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,15 @@ export function GETTER_SETTER_WORKS_ON_EDGE_ONLY(method: string): string {
/**
 * Error message shown when a schema uses a property name reserved by Lyra.
 * Suggests safe alternative spellings of the same name.
 */
export function RESERVED_PROPERTY_NAME(name: string): string {
  const suggestions = `"__${name}", "${name}__", "_${name}_", or similar`;
  return `"${name}" is a reserved property name. Please change it to ${suggestions}.`;
}

/**
 * Error message used when a custom stop-words array contains a non-string entry.
 */
export function CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY(): string {
  return "Custom stop words array must only contain strings.";
}

/**
 * Error message used when `customStopWords` is neither a function nor an array.
 */
export function CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY(): string {
  return "Custom stop words must be a function or an array of strings.";
}

/**
 * Error message used when `tokenizer.stemmingFn` is provided but is not callable.
 */
export function INVALID_STEMMER_FUNCTION_TYPE(): string {
  return "tokenizer.stemmingFn property must be a function.";
}
95 changes: 90 additions & 5 deletions src/lyra.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ import { tokenize } from "./tokenizer";
import { getNanosecondsTime, uniqueId, reservedPropertyNames } from "./utils";
import { Language, SUPPORTED_LANGUAGES } from "./tokenizer/languages";
import type { ResolveSchema, SearchProperties } from "./types";
import { availableStemmers, Stemmer, stemmers } from "./tokenizer/stemmer";
import { create as createNode, Node } from "./prefix-tree/node";
import { find as trieFind, insert as trieInsert, removeDocumentByWord, Nodes } from "./prefix-tree/trie";
import { trackInsertion } from "./insertion-checker";
import { availableStopWords, stopWords } from "./tokenizer/stop-words";

type Index = Record<string, Node>;

Expand All @@ -18,6 +20,20 @@ export type PropertiesSchema = {
[key: string]: PropertyType | PropertiesSchema;
};

/**
 * User-facing tokenizer options accepted in `Configuration.tokenizer`.
 * All fields are optional; missing values are filled in by
 * `defaultTokenizerConfig` (stemming and stop-word removal default to enabled).
 */
export type TokenizerConfig = {
  // Toggle stemming of tokens (defaults to true when resolved).
  enableStemming?: boolean;
  // Toggle stop-word removal (defaults to true when resolved).
  enableStopWords?: boolean;
  // Either a replacement stop-words list, or a function that receives the
  // language's default stop-words and returns the list to use.
  customStopWords?: ((stopWords: string[]) => string[]) | string[];
  // Custom stemming function; overrides the built-in stemmer for the language.
  stemmingFn?: (word: string) => string;
};

/**
 * Fully-resolved tokenizer configuration, as produced by
 * `defaultTokenizerConfig`: toggles are concrete booleans and
 * `customStopWords` is always a plain string array.
 */
export type TokenizerConfigExec = {
  enableStemming: boolean;
  enableStopWords: boolean;
  // Effective stop-words list (custom list if provided, otherwise the
  // language defaults).
  customStopWords: string[];
  // Effective stemming function; undefined when no stemmer is available
  // for the language and none was supplied.
  stemmingFn?: (word: string) => string;
};

export type Configuration<S extends PropertiesSchema> = {
/**
* The structure of the document to be inserted into the database.
Expand All @@ -28,6 +44,7 @@ export type Configuration<S extends PropertiesSchema> = {
*/
defaultLanguage?: Language;
edge?: boolean;
tokenizer?: TokenizerConfig;
};

export type Data<S extends PropertiesSchema> = {
Expand All @@ -41,6 +58,7 @@ export interface Lyra<S extends PropertiesSchema> extends Data<S> {
defaultLanguage: Language;
schema: S;
edge: boolean;
tokenizer?: TokenizerConfig;
}

export type InsertConfig = {
Expand Down Expand Up @@ -152,12 +170,13 @@ function recursiveTrieInsertion<S extends PropertiesSchema>(
id: string,
config: InsertConfig,
prefix = "",
tokenizerConfig: TokenizerConfig,
) {
for (const key of Object.keys(doc)) {
const isNested = typeof doc[key] === "object";
const propName = `${prefix}${key}`;
if (isNested) {
recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".");
recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".", tokenizerConfig);

return;
}
Expand All @@ -166,7 +185,7 @@ function recursiveTrieInsertion<S extends PropertiesSchema>(
// Use propName here because if doc is a nested object
// We will get the wrong index
const requestedTrie = index[propName];
const tokens = tokenize(doc[key] as string, config.language);
const tokens = tokenize(doc[key] as string, config.language, false, tokenizerConfig);

for (const token of tokens) {
trieInsert(nodes, requestedTrie, token, id);
Expand Down Expand Up @@ -247,6 +266,7 @@ export function create<S extends PropertiesSchema>(properties: Configuration<S>)
nodes: {},
index: {},
edge: properties.edge ?? false,
tokenizer: defaultTokenizerConfig(defaultLanguage, properties.tokenizer!),
};

buildIndex(instance, properties.schema);
Expand Down Expand Up @@ -282,7 +302,7 @@ export function insert<S extends PropertiesSchema>(
}

lyra.docs[id] = doc;
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config);
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config, undefined, lyra.tokenizer!);
trackInsertion(lyra);

return { id };
Expand Down Expand Up @@ -358,7 +378,7 @@ export function remove<S extends PropertiesSchema>(lyra: Lyra<S>, docID: string)

if (propertyType === "string") {
const idx = lyra.index[key];
const tokens = tokenize(document[key] as string);
const tokens = tokenize(document[key] as string, lyra.defaultLanguage, false, lyra.tokenizer!);

for (const token of tokens) {
if (token && removeDocumentByWord(lyra.nodes, idx, token, docID)) {
Expand Down Expand Up @@ -394,7 +414,7 @@ export function search<S extends PropertiesSchema>(
language = lyra.defaultLanguage;
}

const tokens = tokenize(params.term, language);
const tokens = tokenize(params.term, language, false, lyra.tokenizer!);
const indices = getIndices(lyra, params.properties);
const uniqueDocIds = new Set<string>();
const { limit = 10, offset = 0, exact = false } = params;
Expand Down Expand Up @@ -466,3 +486,68 @@ export function load<S extends PropertiesSchema>(lyra: Lyra<S>, { index, docs, n
lyra.nodes = nodes;
lyra.schema = schema;
}

/**
 * Resolves a (possibly partial) `TokenizerConfig` into a complete
 * `TokenizerConfigExec` for the given language.
 *
 * - Stemming: a user-supplied `stemmingFn` wins; otherwise the built-in
 *   stemmer for the language is used when available.
 * - Stop-words: `customStopWords` may be a string array or a function that
 *   receives the language's default stop-words; otherwise the defaults apply.
 *
 * @throws Error via `ERRORS.INVALID_STEMMER_FUNCTION_TYPE` when `stemmingFn`
 *   is set but not callable.
 * @throws Error via `ERRORS.CUSTOM_STOP_WORDS_*` when `customStopWords` is
 *   malformed.
 */
export function defaultTokenizerConfig(language: Language, tokenizerConfig: TokenizerConfig = {}): TokenizerConfigExec {
  let defaultStemmingFn: Stemmer | undefined;

  // Resolve the stemming function: a custom one takes precedence over the
  // built-in stemmer for the language (if any exists).
  if (tokenizerConfig.stemmingFn) {
    if (typeof tokenizerConfig.stemmingFn !== "function") {
      throw Error(ERRORS.INVALID_STEMMER_FUNCTION_TYPE());
    }
    defaultStemmingFn = tokenizerConfig.stemmingFn;
  } else {
    defaultStemmingFn = availableStemmers.includes(language) ? stemmers[language] : undefined;
  }

  // Default stop-words for the language (empty when the language has none).
  const defaultStopWords: string[] = availableStopWords.includes(language) ? stopWords[language]! : [];

  // Resolve custom stop-words, when provided.
  let customStopWords: string[] | undefined;

  if (tokenizerConfig.customStopWords) {
    switch (typeof tokenizerConfig.customStopWords) {
      // A function receives the default stop-words for the language and
      // returns the list to use.
      case "function":
        customStopWords = tokenizerConfig.customStopWords(defaultStopWords);
        break;

      // An array must contain only strings; any other object shape is a
      // misconfiguration.
      case "object":
        if (!Array.isArray(tokenizerConfig.customStopWords)) {
          throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
        }
        if (tokenizerConfig.customStopWords.some((x: unknown) => typeof x !== "string")) {
          throw Error(ERRORS.CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY());
        }
        customStopWords = tokenizerConfig.customStopWords;
        break;

      // Any other type is a misconfiguration.
      default:
        throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
    }
  }

  return {
    enableStopWords: tokenizerConfig.enableStopWords ?? true,
    enableStemming: tokenizerConfig.enableStemming ?? true,
    stemmingFn: defaultStemmingFn,
    customStopWords: customStopWords ?? defaultStopWords,
  };
}
51 changes: 48 additions & 3 deletions src/tokenizer/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import { Language } from "./languages";
import type { Language } from "./languages";
import type { TokenizerConfig } from "../lyra";
import { defaultTokenizerConfig } from "../lyra";
import { replaceDiacritics } from "./diacritics";
import { stemmers } from "./stemmer";

const splitRegex: Record<Language, RegExp> = {
dutch: /[^a-z0-9_'-]+/gim,
Expand All @@ -13,14 +16,56 @@ const splitRegex: Record<Language, RegExp> = {
swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim,
};

export function tokenize(input: string, language: Language = "english", allowDuplicates = false) {
// Memoizes normalization results per `${language}:${token}` pair.
// NOTE(review): the key does not include the tokenizer config, so instances
// with different configs share cached results — TODO confirm this is intended.
export const normalizationCache = new Map<string, string>();

/**
 * Normalizes a single token: drops it when it is a stop-word (if enabled),
 * stems it (if enabled and a stemmer is available), and strips diacritics.
 * Returns "" for removed stop-words; results are memoized in
 * `normalizationCache`.
 */
function normalizeToken(token: string, language: Language, tokenizerConfig: TokenizerConfig): string {
  const key = `${language}:${token}`;

  const cached = normalizationCache.get(key);
  if (cached !== undefined) {
    return cached;
  }

  // Stop-word removal: guard with Array.isArray because `customStopWords`
  // may be undefined (or still a function) on a raw, unresolved config —
  // the original unguarded cast crashed in that case.
  if (tokenizerConfig?.enableStopWords) {
    const stopWordsList = tokenizerConfig.customStopWords;
    if (Array.isArray(stopWordsList) && stopWordsList.includes(token)) {
      normalizationCache.set(key, "");
      return "";
    }
  }

  // Stem the token when stemming is enabled and a stemmer is available.
  if (tokenizerConfig?.enableStemming && typeof tokenizerConfig.stemmingFn === "function") {
    token = tokenizerConfig.stemmingFn(token);
  }

  token = replaceDiacritics(token);
  normalizationCache.set(key, token);
  return token;
}

export function tokenize(
input: string,
language: Language = "english",
allowDuplicates = false,
tokenizerConfig: TokenizerConfig = defaultTokenizerConfig(language),
) {
/* c8 ignore next 3 */
if (typeof input !== "string") {
return [input];
}

const splitRule = splitRegex[language];
const tokens = input.toLowerCase().split(splitRule).map(replaceDiacritics);
const tokens = input
.toLowerCase()
.split(splitRule)
.map(token => normalizeToken(token, language, tokenizerConfig!))
.filter(Boolean);

const trimTokens = trim(tokens);

if (!allowDuplicates) {
Expand Down
Loading

0 comments on commit 9f5995d

Please sign in to comment.