Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(stemmer): adds english stemmer #109

Merged
merged 4 commits into from
Aug 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,15 @@ export function GETTER_SETTER_WORKS_ON_EDGE_ONLY(method: string): string {
/**
 * Builds the error message shown when a schema uses a reserved property name.
 *
 * @param name - The offending property name.
 * @returns A message suggesting underscore-decorated alternatives.
 */
export function RESERVED_PROPERTY_NAME(name: string): string {
  const suggestions = `"__${name}", "${name}__", "_${name}_"`;
  return `"${name}" is a reserved property name. Please change it to ${suggestions}, or similar.`;
}

/**
 * Error message used when a custom stop-words array contains non-string values.
 *
 * @returns The validation error message.
 */
export function CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY(): string {
  return "Custom stop words array must only contain strings.";
}

/**
 * Error message used when `customStopWords` is neither a function nor an
 * array of strings.
 *
 * @returns The validation error message.
 */
export function CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY(): string {
  return "Custom stop words must be a function or an array of strings.";
}

/**
 * Error message used when `tokenizer.stemmingFn` is provided but is not a
 * callable function.
 *
 * @returns The validation error message.
 */
export function INVALID_STEMMER_FUNCTION_TYPE(): string {
  return "tokenizer.stemmingFn property must be a function.";
}
95 changes: 90 additions & 5 deletions src/lyra.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ import { tokenize } from "./tokenizer";
import { getNanosecondsTime, uniqueId, reservedPropertyNames } from "./utils";
import { Language, SUPPORTED_LANGUAGES } from "./tokenizer/languages";
import type { ResolveSchema, SearchProperties } from "./types";
import { availableStemmers, Stemmer, stemmers } from "./tokenizer/stemmer";
import { create as createNode, Node } from "./prefix-tree/node";
import { find as trieFind, insert as trieInsert, removeDocumentByWord, Nodes } from "./prefix-tree/trie";
import { trackInsertion } from "./insertion-checker";
import { availableStopWords, stopWords } from "./tokenizer/stop-words";

type Index = Record<string, Node>;

Expand All @@ -18,6 +20,20 @@ export type PropertiesSchema = {
[key: string]: PropertyType | PropertiesSchema;
};

/**
 * User-facing tokenizer options. Every field is optional; missing fields are
 * filled in with language-aware defaults by `defaultTokenizerConfig`.
 */
export type TokenizerConfig = {
  // When false, tokens are not passed through a stemming function.
  enableStemming?: boolean;
  // When false, stop-word removal is skipped entirely.
  enableStopWords?: boolean;
  // Either a replacement list of stop words, or a function that receives the
  // language's default stop words and returns the list to use.
  customStopWords?: ((stopWords: string[]) => string[]) | string[];
  // Custom stemming function; when omitted, the built-in stemmer for the
  // language is used if one is available.
  stemmingFn?: (word: string) => string;
};

export type TokenizerConfigExec = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can streamline this as export type TokenizerConfigExec = Required<TokenizerConfig>

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, one problem is that stemmingFn can be undefined (to fallback to the default stemming function). Required is not compatible with undefined values

enableStemming: boolean;
enableStopWords: boolean;
customStopWords: string[];
stemmingFn?: (word: string) => string;
};

export type Configuration<S extends PropertiesSchema> = {
/**
* The structure of the document to be inserted into the database.
Expand All @@ -28,6 +44,7 @@ export type Configuration<S extends PropertiesSchema> = {
*/
defaultLanguage?: Language;
edge?: boolean;
tokenizer?: TokenizerConfig;
};

export type Data<S extends PropertiesSchema> = {
Expand All @@ -41,6 +58,7 @@ export interface Lyra<S extends PropertiesSchema> extends Data<S> {
defaultLanguage: Language;
schema: S;
edge: boolean;
tokenizer?: TokenizerConfig;
}

export type InsertConfig = {
Expand Down Expand Up @@ -152,12 +170,13 @@ function recursiveTrieInsertion<S extends PropertiesSchema>(
id: string,
config: InsertConfig,
prefix = "",
tokenizerConfig: TokenizerConfig,
) {
for (const key of Object.keys(doc)) {
const isNested = typeof doc[key] === "object";
const propName = `${prefix}${key}`;
if (isNested) {
recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".");
recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".", tokenizerConfig);

return;
}
Expand All @@ -166,7 +185,7 @@ function recursiveTrieInsertion<S extends PropertiesSchema>(
// Use propName here because if doc is a nested object
// We will get the wrong index
const requestedTrie = index[propName];
const tokens = tokenize(doc[key] as string, config.language);
const tokens = tokenize(doc[key] as string, config.language, false, tokenizerConfig);

for (const token of tokens) {
trieInsert(nodes, requestedTrie, token, id);
Expand Down Expand Up @@ -247,6 +266,7 @@ export function create<S extends PropertiesSchema>(properties: Configuration<S>)
nodes: {},
index: {},
edge: properties.edge ?? false,
tokenizer: defaultTokenizerConfig(defaultLanguage, properties.tokenizer!),
};

buildIndex(instance, properties.schema);
Expand Down Expand Up @@ -282,7 +302,7 @@ export function insert<S extends PropertiesSchema>(
}

lyra.docs[id] = doc;
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config);
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config, undefined, lyra.tokenizer!);
trackInsertion(lyra);

return { id };
Expand Down Expand Up @@ -358,7 +378,7 @@ export function remove<S extends PropertiesSchema>(lyra: Lyra<S>, docID: string)

if (propertyType === "string") {
const idx = lyra.index[key];
const tokens = tokenize(document[key] as string);
const tokens = tokenize(document[key] as string, lyra.defaultLanguage, false, lyra.tokenizer!);

for (const token of tokens) {
if (token && removeDocumentByWord(lyra.nodes, idx, token, docID)) {
Expand Down Expand Up @@ -394,7 +414,7 @@ export function search<S extends PropertiesSchema>(
language = lyra.defaultLanguage;
}

const tokens = tokenize(params.term, language);
const tokens = tokenize(params.term, language, false, lyra.tokenizer!);
const indices = getIndices(lyra, params.properties);
const uniqueDocIds = new Set<string>();
const { limit = 10, offset = 0, exact = false } = params;
Expand Down Expand Up @@ -466,3 +486,68 @@ export function load<S extends PropertiesSchema>(lyra: Lyra<S>, { index, docs, n
lyra.nodes = nodes;
lyra.schema = schema;
}

/**
 * Normalizes a user-supplied `TokenizerConfig` into a fully-populated
 * `TokenizerConfigExec`, filling in language-specific defaults for stemming
 * and stop words.
 *
 * @param language - Language whose built-in stemmer/stop words are used as defaults.
 * @param tokenizerConfig - Optional user overrides (defaults to `{}`).
 * @returns The resolved configuration with all defaults applied.
 * @throws When `stemmingFn` is not a function, or `customStopWords` is neither
 *         a function nor an array of strings.
 */
export function defaultTokenizerConfig(language: Language, tokenizerConfig: TokenizerConfig = {}): TokenizerConfigExec {
  let defaultStemmingFn: Stemmer | undefined;

  // Validate a custom stemming function when one is provided; otherwise fall
  // back to the built-in stemmer for the language (undefined when none exists).
  if (tokenizerConfig.stemmingFn) {
    if (typeof tokenizerConfig.stemmingFn !== "function") {
      throw Error(ERRORS.INVALID_STEMMER_FUNCTION_TYPE());
    }
    defaultStemmingFn = tokenizerConfig.stemmingFn;
  } else {
    defaultStemmingFn = availableStemmers.includes(language) ? stemmers[language]! : undefined;
  }

  // Default stop-words for the given language (empty when unsupported).
  const defaultStopWords: string[] = availableStopWords.includes(language) ? stopWords[language]! : [];

  // Resolve custom stop-words, if any.
  let customStopWords: string[] | undefined;

  if (tokenizerConfig.customStopWords) {
    switch (typeof tokenizerConfig.customStopWords) {
      // A function receives the default stop-words for the language and
      // returns the list to use instead.
      case "function":
        customStopWords = tokenizerConfig.customStopWords(defaultStopWords);
        break;

      // An array must contain only strings; a non-array object or an array
      // with non-string entries is a misconfiguration.
      case "object":
        if (!Array.isArray(tokenizerConfig.customStopWords)) {
          throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
        }
        if ((tokenizerConfig.customStopWords as string[]).some((x: unknown) => typeof x !== "string")) {
          throw Error(ERRORS.CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY());
        }
        customStopWords = tokenizerConfig.customStopWords as string[];
        break;

      // Any other type is a misconfiguration.
      default:
        throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
    }
  }

  return {
    enableStopWords: tokenizerConfig.enableStopWords ?? true,
    enableStemming: tokenizerConfig.enableStemming ?? true,
    stemmingFn: defaultStemmingFn,
    customStopWords: customStopWords ?? defaultStopWords,
  };
}
51 changes: 48 additions & 3 deletions src/tokenizer/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import { Language } from "./languages";
import type { Language } from "./languages";
import type { TokenizerConfig } from "../lyra";
import { defaultTokenizerConfig } from "../lyra";
import { replaceDiacritics } from "./diacritics";
import { stemmers } from "./stemmer";

const splitRegex: Record<Language, RegExp> = {
dutch: /[^a-z0-9_'-]+/gim,
Expand All @@ -13,14 +16,56 @@ const splitRegex: Record<Language, RegExp> = {
swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim,
};

export function tokenize(input: string, language: Language = "english", allowDuplicates = false) {
// Memoizes normalized tokens keyed by `${language}:${token}`; typed explicitly
// so lookups don't degrade to Map<any, any>.
export const normalizationCache = new Map<string, string>();

/**
 * Normalizes a single token: drops stop words (by mapping them to the empty
 * string, which the caller filters out), applies the configured stemming
 * function, and strips diacritics. Results are memoized in
 * `normalizationCache`.
 *
 * NOTE(review): the cache key does not include the tokenizer config, so two
 * instances with different stop-word/stemming settings share cache entries —
 * confirm this is acceptable before relying on per-instance configs.
 *
 * @param token - Lower-cased raw token to normalize.
 * @param language - Language used to scope the cache key.
 * @param tokenizerConfig - Resolved tokenizer options controlling the steps.
 * @returns The normalized token, or "" when it is a stop word.
 */
function normalizeToken(token: string, language: Language, tokenizerConfig: TokenizerConfig): string {
  const key = `${language}:${token}`;

  if (normalizationCache.has(key)) {
    return normalizationCache.get(key)!;
  }

  // Stop-word removal: a matching token normalizes to the empty string.
  // Guard with ?? [] — a raw TokenizerConfig may legitimately omit
  // customStopWords (the original unchecked cast crashed in that case).
  if (tokenizerConfig?.enableStopWords && (tokenizerConfig.customStopWords as string[] | undefined ?? []).includes(token)) {
    normalizationCache.set(key, "");
    return "";
  }

  // Stemming: applied only when a stemming function is configured.
  if (tokenizerConfig?.enableStemming && typeof tokenizerConfig.stemmingFn === "function") {
    token = tokenizerConfig.stemmingFn(token);
  }

  token = replaceDiacritics(token);
  normalizationCache.set(key, token);
  return token;
}

export function tokenize(
input: string,
language: Language = "english",
allowDuplicates = false,
tokenizerConfig: TokenizerConfig = defaultTokenizerConfig(language),
) {
/* c8 ignore next 3 */
if (typeof input !== "string") {
return [input];
}

const splitRule = splitRegex[language];
const tokens = input.toLowerCase().split(splitRule).map(replaceDiacritics);
const tokens = input
.toLowerCase()
.split(splitRule)
.map(token => normalizeToken(token, language, tokenizerConfig!))
.filter(Boolean);

const trimTokens = trim(tokens);

if (!allowDuplicates) {
Expand Down
Loading