Skip to content

Commit

Permalink
feat(stemmer): adds english stemmer (#109)
Browse files Browse the repository at this point in the history
* feat(stemmer): adds english stemmer
* feat(tokenizer): adds english stop-words list
* feat(tokenizer): adds ability to disable/enable stemming and stop-words
* test(tokenizer): adds tests for custom tokenizer configs
  • Loading branch information
micheleriva authored Aug 29, 2022
1 parent 6094fcc commit 9f5995d
Show file tree
Hide file tree
Showing 13 changed files with 780 additions and 100 deletions.
12 changes: 12 additions & 0 deletions src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,15 @@ export function GETTER_SETTER_WORKS_ON_EDGE_ONLY(method: string): string {
/**
 * Error message shown when a schema uses a property name reserved by Lyra.
 * Suggests safe alternative spellings of the same name.
 */
export function RESERVED_PROPERTY_NAME(name: string): string {
  const suggestions = `"__${name}", "${name}__", "_${name}_", or similar`;
  return `"${name}" is a reserved property name. Please change it to ${suggestions}.`;
}

/**
 * Error message used when a custom stop-words array contains a non-string entry.
 */
export function CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY(): string {
  return "Custom stop words array must only contain strings.";
}

/**
 * Error message used when `customStopWords` is neither a function nor an array.
 */
export function CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY(): string {
  return "Custom stop words must be a function or an array of strings.";
}

/**
 * Error message used when `tokenizer.stemmingFn` is provided but is not callable.
 */
export function INVALID_STEMMER_FUNCTION_TYPE(): string {
  return "tokenizer.stemmingFn property must be a function.";
}
95 changes: 90 additions & 5 deletions src/lyra.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ import { tokenize } from "./tokenizer";
import { getNanosecondsTime, uniqueId, reservedPropertyNames } from "./utils";
import { Language, SUPPORTED_LANGUAGES } from "./tokenizer/languages";
import type { ResolveSchema, SearchProperties } from "./types";
import { availableStemmers, Stemmer, stemmers } from "./tokenizer/stemmer";
import { create as createNode, Node } from "./prefix-tree/node";
import { find as trieFind, insert as trieInsert, removeDocumentByWord, Nodes } from "./prefix-tree/trie";
import { trackInsertion } from "./insertion-checker";
import { availableStopWords, stopWords } from "./tokenizer/stop-words";

type Index = Record<string, Node>;

Expand All @@ -18,6 +20,20 @@ export type PropertiesSchema = {
[key: string]: PropertyType | PropertiesSchema;
};

/**
 * User-facing tokenizer options accepted in `Configuration.tokenizer`.
 * All fields are optional; missing values are filled in by
 * `defaultTokenizerConfig` (stemming and stop-word removal default to enabled).
 */
export type TokenizerConfig = {
  // Toggle stemming of tokens (defaults to true when resolved).
  enableStemming?: boolean;
  // Toggle stop-word removal (defaults to true when resolved).
  enableStopWords?: boolean;
  // Either a replacement stop-words list, or a function that receives the
  // language's default stop-words and returns the list to use.
  customStopWords?: ((stopWords: string[]) => string[]) | string[];
  // Custom stemming function; overrides the built-in stemmer for the language.
  stemmingFn?: (word: string) => string;
};

/**
 * Fully-resolved tokenizer configuration, as produced by
 * `defaultTokenizerConfig`: toggles are concrete booleans and
 * `customStopWords` is always a plain string array.
 */
export type TokenizerConfigExec = {
  enableStemming: boolean;
  enableStopWords: boolean;
  // Effective stop-words list (custom list if provided, otherwise the
  // language defaults).
  customStopWords: string[];
  // Effective stemming function; undefined when no stemmer is available
  // for the language and none was supplied.
  stemmingFn?: (word: string) => string;
};

export type Configuration<S extends PropertiesSchema> = {
/**
* The structure of the document to be inserted into the database.
Expand All @@ -28,6 +44,7 @@ export type Configuration<S extends PropertiesSchema> = {
*/
defaultLanguage?: Language;
edge?: boolean;
tokenizer?: TokenizerConfig;
};

export type Data<S extends PropertiesSchema> = {
Expand All @@ -41,6 +58,7 @@ export interface Lyra<S extends PropertiesSchema> extends Data<S> {
defaultLanguage: Language;
schema: S;
edge: boolean;
tokenizer?: TokenizerConfig;
}

export type InsertConfig = {
Expand Down Expand Up @@ -152,12 +170,13 @@ function recursiveTrieInsertion<S extends PropertiesSchema>(
id: string,
config: InsertConfig,
prefix = "",
tokenizerConfig: TokenizerConfig,
) {
for (const key of Object.keys(doc)) {
const isNested = typeof doc[key] === "object";
const propName = `${prefix}${key}`;
if (isNested) {
recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".");
recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".", tokenizerConfig);

return;
}
Expand All @@ -166,7 +185,7 @@ function recursiveTrieInsertion<S extends PropertiesSchema>(
// Use propName here because if doc is a nested object
// We will get the wrong index
const requestedTrie = index[propName];
const tokens = tokenize(doc[key] as string, config.language);
const tokens = tokenize(doc[key] as string, config.language, false, tokenizerConfig);

for (const token of tokens) {
trieInsert(nodes, requestedTrie, token, id);
Expand Down Expand Up @@ -247,6 +266,7 @@ export function create<S extends PropertiesSchema>(properties: Configuration<S>)
nodes: {},
index: {},
edge: properties.edge ?? false,
tokenizer: defaultTokenizerConfig(defaultLanguage, properties.tokenizer!),
};

buildIndex(instance, properties.schema);
Expand Down Expand Up @@ -282,7 +302,7 @@ export function insert<S extends PropertiesSchema>(
}

lyra.docs[id] = doc;
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config);
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config, undefined, lyra.tokenizer!);
trackInsertion(lyra);

return { id };
Expand Down Expand Up @@ -358,7 +378,7 @@ export function remove<S extends PropertiesSchema>(lyra: Lyra<S>, docID: string)

if (propertyType === "string") {
const idx = lyra.index[key];
const tokens = tokenize(document[key] as string);
const tokens = tokenize(document[key] as string, lyra.defaultLanguage, false, lyra.tokenizer!);

for (const token of tokens) {
if (token && removeDocumentByWord(lyra.nodes, idx, token, docID)) {
Expand Down Expand Up @@ -394,7 +414,7 @@ export function search<S extends PropertiesSchema>(
language = lyra.defaultLanguage;
}

const tokens = tokenize(params.term, language);
const tokens = tokenize(params.term, language, false, lyra.tokenizer!);
const indices = getIndices(lyra, params.properties);
const uniqueDocIds = new Set<string>();
const { limit = 10, offset = 0, exact = false } = params;
Expand Down Expand Up @@ -466,3 +486,68 @@ export function load<S extends PropertiesSchema>(lyra: Lyra<S>, { index, docs, n
lyra.nodes = nodes;
lyra.schema = schema;
}

/**
 * Resolves a (possibly partial) `TokenizerConfig` into a complete
 * `TokenizerConfigExec` for the given language.
 *
 * - Stemming: a user-supplied `stemmingFn` wins; otherwise the built-in
 *   stemmer for the language is used when available.
 * - Stop-words: `customStopWords` may be a string array or a function that
 *   receives the language's default stop-words; otherwise the defaults apply.
 *
 * @throws Error via `ERRORS.INVALID_STEMMER_FUNCTION_TYPE` when `stemmingFn`
 *   is set but not callable.
 * @throws Error via `ERRORS.CUSTOM_STOP_WORDS_*` when `customStopWords` is
 *   malformed.
 */
export function defaultTokenizerConfig(language: Language, tokenizerConfig: TokenizerConfig = {}): TokenizerConfigExec {
  let defaultStemmingFn: Stemmer | undefined;

  // Resolve the stemming function: a custom one takes precedence over the
  // built-in stemmer for the language (if any exists).
  if (tokenizerConfig.stemmingFn) {
    if (typeof tokenizerConfig.stemmingFn !== "function") {
      throw Error(ERRORS.INVALID_STEMMER_FUNCTION_TYPE());
    }
    defaultStemmingFn = tokenizerConfig.stemmingFn;
  } else {
    defaultStemmingFn = availableStemmers.includes(language) ? stemmers[language] : undefined;
  }

  // Default stop-words for the language (empty when the language has none).
  const defaultStopWords: string[] = availableStopWords.includes(language) ? stopWords[language]! : [];

  // Resolve custom stop-words, when provided.
  let customStopWords: string[] | undefined;

  if (tokenizerConfig.customStopWords) {
    switch (typeof tokenizerConfig.customStopWords) {
      // A function receives the default stop-words for the language and
      // returns the list to use.
      case "function":
        customStopWords = tokenizerConfig.customStopWords(defaultStopWords);
        break;

      // An array must contain only strings; any other object shape is a
      // misconfiguration.
      case "object":
        if (!Array.isArray(tokenizerConfig.customStopWords)) {
          throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
        }
        if (tokenizerConfig.customStopWords.some((x: unknown) => typeof x !== "string")) {
          throw Error(ERRORS.CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY());
        }
        customStopWords = tokenizerConfig.customStopWords;
        break;

      // Any other type is a misconfiguration.
      default:
        throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
    }
  }

  return {
    enableStopWords: tokenizerConfig.enableStopWords ?? true,
    enableStemming: tokenizerConfig.enableStemming ?? true,
    stemmingFn: defaultStemmingFn,
    customStopWords: customStopWords ?? defaultStopWords,
  };
}
51 changes: 48 additions & 3 deletions src/tokenizer/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import { Language } from "./languages";
import type { Language } from "./languages";
import type { TokenizerConfig } from "../lyra";
import { defaultTokenizerConfig } from "../lyra";
import { replaceDiacritics } from "./diacritics";
import { stemmers } from "./stemmer";

const splitRegex: Record<Language, RegExp> = {
dutch: /[^a-z0-9_'-]+/gim,
Expand All @@ -13,14 +16,56 @@ const splitRegex: Record<Language, RegExp> = {
swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim,
};

export function tokenize(input: string, language: Language = "english", allowDuplicates = false) {
// Memoizes normalization results per `${language}:${token}` pair.
// NOTE(review): the key does not include the tokenizer config, so instances
// with different configs share cached results — TODO confirm this is intended.
export const normalizationCache = new Map<string, string>();

/**
 * Normalizes a single token: drops it when it is a stop-word (if enabled),
 * stems it (if enabled and a stemmer is available), and strips diacritics.
 * Returns "" for removed stop-words; results are memoized in
 * `normalizationCache`.
 */
function normalizeToken(token: string, language: Language, tokenizerConfig: TokenizerConfig): string {
  const key = `${language}:${token}`;

  const cached = normalizationCache.get(key);
  if (cached !== undefined) {
    return cached;
  }

  // Stop-word removal: guard with Array.isArray because `customStopWords`
  // may be undefined (or still a function) on a raw, unresolved config —
  // the original unguarded cast crashed in that case.
  if (tokenizerConfig?.enableStopWords) {
    const stopWordsList = tokenizerConfig.customStopWords;
    if (Array.isArray(stopWordsList) && stopWordsList.includes(token)) {
      normalizationCache.set(key, "");
      return "";
    }
  }

  // Stem the token when stemming is enabled and a stemmer is available.
  if (tokenizerConfig?.enableStemming && typeof tokenizerConfig.stemmingFn === "function") {
    token = tokenizerConfig.stemmingFn(token);
  }

  token = replaceDiacritics(token);
  normalizationCache.set(key, token);
  return token;
}

export function tokenize(
input: string,
language: Language = "english",
allowDuplicates = false,
tokenizerConfig: TokenizerConfig = defaultTokenizerConfig(language),
) {
/* c8 ignore next 3 */
if (typeof input !== "string") {
return [input];
}

const splitRule = splitRegex[language];
const tokens = input.toLowerCase().split(splitRule).map(replaceDiacritics);
const tokens = input
.toLowerCase()
.split(splitRule)
.map(token => normalizeToken(token, language, tokenizerConfig!))
.filter(Boolean);

const trimTokens = trim(tokens);

if (!allowDuplicates) {
Expand Down
Loading

0 comments on commit 9f5995d

Please sign in to comment.