diff --git a/src/errors.ts b/src/errors.ts
index c75eda439..a52fd867d 100644
--- a/src/errors.ts
+++ b/src/errors.ts
@@ -47,3 +47,15 @@ export function GETTER_SETTER_WORKS_ON_EDGE_ONLY(method: string): string {
 export function RESERVED_PROPERTY_NAME(name: string): string {
   return `"${name}" is a reserved property name. Please change it to "__${name}", "${name}__", "_${name}_", or similar.`;
 }
+
+export function CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY(): string {
+  return `Custom stop words array must only contain strings.`;
+}
+
+export function CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY(): string {
+  return `Custom stop words must be a function or an array of strings.`;
+}
+
+export function INVALID_STEMMER_FUNCTION_TYPE(): string {
+  return `tokenizer.stemmingFn property must be a function.`;
+}
diff --git a/src/lyra.ts b/src/lyra.ts
index 8762720cc..89e548685 100644
--- a/src/lyra.ts
+++ b/src/lyra.ts
@@ -3,9 +3,11 @@
 import { tokenize } from "./tokenizer";
 import { getNanosecondsTime, uniqueId, reservedPropertyNames } from "./utils";
 import { Language, SUPPORTED_LANGUAGES } from "./tokenizer/languages";
 import type { ResolveSchema, SearchProperties } from "./types";
+import { availableStemmers, Stemmer, stemmers } from "./tokenizer/stemmer";
 import { create as createNode, Node } from "./prefix-tree/node";
 import { find as trieFind, insert as trieInsert, removeDocumentByWord, Nodes } from "./prefix-tree/trie";
 import { trackInsertion } from "./insertion-checker";
+import { availableStopWords, stopWords } from "./tokenizer/stop-words";
 
 type Index = Record<string, string>;
@@ -18,6 +20,20 @@ export type PropertiesSchema = {
   [key: string]: PropertyType | PropertiesSchema;
 };
 
+export type TokenizerConfig = {
+  enableStemming?: boolean;
+  enableStopWords?: boolean;
+  customStopWords?: ((stopWords: string[]) => string[]) | string[];
+  stemmingFn?: (word: string) => string;
+};
+
+export type TokenizerConfigExec = {
+  enableStemming: boolean;
+  enableStopWords: boolean;
+  customStopWords: string[];
+  stemmingFn?: (word: string) => string;
+};
+
 export type Configuration<S extends PropertiesSchema> = {
   /**
    * The structure of the document to be inserted into the database.
    */
@@ -28,6 +44,7 @@ export type Configuration<S extends PropertiesSchema> = {
    */
   defaultLanguage?: Language;
   edge?: boolean;
+  tokenizer?: TokenizerConfig;
 };
 
 export type Data<S extends PropertiesSchema> = {
@@ -41,6 +58,7 @@ export interface Lyra<S extends PropertiesSchema> extends Data<S> {
   defaultLanguage: Language;
   schema: S;
   edge: boolean;
+  tokenizer?: TokenizerConfig;
 }
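For readers of this patch, a minimal sketch of how the new `tokenizer` option is meant to be passed to `create`. The schema and the toy stemming function below are illustrative only, not part of this diff:

```ts
import { create } from "./src/lyra";

const db = create({
  schema: { title: "string" },
  tokenizer: {
    enableStemming: true,                        // default: true
    enableStopWords: true,                       // default: true
    customStopWords: ["foo", "bar"],             // or (defaults: string[]) => string[]
    stemmingFn: word => word.replace(/s$/, ""),  // toy stemmer, replaces the built-in one
  },
});
```

Every field is optional; `defaultTokenizerConfig` (added further down in this file) fills in the language defaults.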
 
 export type InsertConfig = {
@@ -152,12 +170,13 @@ function recursiveTrieInsertion(
   id: string,
   config: InsertConfig,
   prefix = "",
+  tokenizerConfig: TokenizerConfig,
 ) {
   for (const key of Object.keys(doc)) {
     const isNested = typeof doc[key] === "object";
     const propName = `${prefix}${key}`;
     if (isNested) {
-      recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".");
+      recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".", tokenizerConfig);
       return;
     }
 
@@ -166,7 +185,7 @@ function recursiveTrieInsertion(
       // Use propName here because if doc is a nested object
       // We will get the wrong index
       const requestedTrie = index[propName];
-      const tokens = tokenize(doc[key] as string, config.language);
+      const tokens = tokenize(doc[key] as string, config.language, false, tokenizerConfig);
 
       for (const token of tokens) {
         trieInsert(nodes, requestedTrie, token, id);
@@ -247,6 +266,7 @@ export function create<S extends PropertiesSchema>(properties: Configuration<S>)
     nodes: {},
     index: {},
     edge: properties.edge ?? false,
+    tokenizer: defaultTokenizerConfig(defaultLanguage, properties.tokenizer!),
   };
 
   buildIndex(instance, properties.schema);
@@ -282,7 +302,7 @@ export function insert(
   }
 
   lyra.docs[id] = doc;
-  recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config);
+  recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config, undefined, lyra.tokenizer!);
   trackInsertion(lyra);
 
   return { id };
@@ -358,7 +378,7 @@ export function remove<S extends PropertiesSchema>(lyra: Lyra<S>, docID: string)
     if (propertyType === "string") {
       const idx = lyra.index[key];
-      const tokens = tokenize(document[key] as string);
+      const tokens = tokenize(document[key] as string, lyra.defaultLanguage, false, lyra.tokenizer!);
 
       for (const token of tokens) {
         if (token && removeDocumentByWord(lyra.nodes, idx, token, docID)) {
@@ -394,7 +414,7 @@ export function search(
     language = lyra.defaultLanguage;
   }
 
-  const tokens = tokenize(params.term, language);
+  const tokens = tokenize(params.term, language, false, lyra.tokenizer!);
   const indices = getIndices(lyra, params.properties);
   const uniqueDocIds = new Set<string>();
   const { limit = 10, offset = 0, exact = false } = params;
@@ -466,3 +486,68 @@ export function load(lyra: Lyra, { index, docs, n
   lyra.nodes = nodes;
   lyra.schema = schema;
 }
+
+export function defaultTokenizerConfig(language: Language, tokenizerConfig: TokenizerConfig = {}): TokenizerConfigExec {
+  let defaultStopWords: string[];
+  let defaultStemmingFn: Stemmer | undefined;
+
+  // Enable custom stemming function
+  if (tokenizerConfig?.stemmingFn) {
+    if (typeof tokenizerConfig.stemmingFn === "function") {
+      defaultStemmingFn = tokenizerConfig.stemmingFn;
+    } else {
+      throw Error(ERRORS.INVALID_STEMMER_FUNCTION_TYPE());
+    }
+  } else {
+    if (availableStemmers.includes(language)) {
+      defaultStemmingFn = stemmers[language]!;
+    } else {
+      defaultStemmingFn = undefined;
+    }
+  }
+
+  // Enable default stop-words
+  if (availableStopWords.includes(language)) {
+    defaultStopWords = stopWords[language]!;
+  } else {
+    defaultStopWords = [];
+  }
+
+  // Enable custom stop-words
+  let customStopWords: string[] | undefined;
+
+  if (tokenizerConfig?.customStopWords) {
+    switch (typeof tokenizerConfig.customStopWords) {
+      // Execute the custom stop-words function.
+      // This will pass the default stop-words for a given language as a first parameter.
+      case "function":
+        customStopWords = tokenizerConfig.customStopWords(defaultStopWords);
+        break;
+
+      // Check if the custom stop-words is an array.
+      // If it's an object, throw an exception. If the array contains any non-string value, throw an exception.
+      case "object":
+        if (Array.isArray(tokenizerConfig.customStopWords)) {
+          if ((tokenizerConfig.customStopWords as string[]).some((x: unknown) => typeof x !== "string")) {
+            throw Error(ERRORS.CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY());
+          } else {
+            customStopWords = tokenizerConfig.customStopWords as string[];
+          }
+        } else {
+          throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
+        }
+        break;
+
+      // By default, throw an exception, as this is a misconfiguration.
+      default:
+        throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
+    }
+  }
+
+  return {
+    enableStopWords: tokenizerConfig?.enableStopWords ?? true,
+    enableStemming: tokenizerConfig?.enableStemming ?? true,
+    stemmingFn: defaultStemmingFn,
+    customStopWords: customStopWords ?? defaultStopWords,
+  };
+}
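A sketch of what `defaultTokenizerConfig` resolves when only one flag is overridden, assuming the English defaults shipped in this patch:

```ts
import { defaultTokenizerConfig } from "./src/lyra";

const exec = defaultTokenizerConfig("english", { enableStemming: false });
// exec.enableStemming  === false  -> explicit override
// exec.enableStopWords === true   -> default
// exec.customStopWords            -> the bundled English stop-word list
// exec.stemmingFn                 -> the bundled English stemmer (resolved,
//                                    but ignored while enableStemming is false)
```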
diff --git a/src/tokenizer/index.ts b/src/tokenizer/index.ts
index 875d4536e..13d7f26af 100644
--- a/src/tokenizer/index.ts
+++ b/src/tokenizer/index.ts
@@ -1,5 +1,7 @@
-import { Language } from "./languages";
+import type { Language } from "./languages";
+import type { TokenizerConfig } from "../lyra";
+import { defaultTokenizerConfig } from "../lyra";
 import { replaceDiacritics } from "./diacritics";
 
 const splitRegex: Record<Language, RegExp> = {
   dutch: /[^a-z0-9_'-]+/gim,
@@ -13,14 +15,55 @@ const splitRegex: Record<Language, RegExp> = {
   swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim,
 };
 
-export function tokenize(input: string, language: Language = "english", allowDuplicates = false) {
+export const normalizationCache = new Map<string, string>();
+
+function normalizeToken(token: string, language: Language, tokenizerConfig: TokenizerConfig): string {
+  const key = `${language}:${token}`;
+
+  if (normalizationCache.has(key)) {
+    return normalizationCache.get(key)!;
+  } else {
+    // Check if stop-words removal is enabled
+    if (tokenizerConfig?.enableStopWords) {
+      // Remove stop-words: cache and return the empty string so the token is filtered out
+      if ((tokenizerConfig?.customStopWords as string[]).includes(token)) {
+        normalizationCache.set(key, "");
+        return "";
+      }
+    }
+
+    // Check if stemming is enabled
+    if (tokenizerConfig?.enableStemming) {
+      // Stem token when a stemming function is available
+      if (typeof tokenizerConfig?.stemmingFn === "function") {
+        token = tokenizerConfig?.stemmingFn(token);
+      }
+    }
+
+    token = replaceDiacritics(token);
+    normalizationCache.set(key, token);
+    return token;
+  }
+}
+
+export function tokenize(
+  input: string,
+  language: Language = "english",
+  allowDuplicates = false,
+  tokenizerConfig: TokenizerConfig = defaultTokenizerConfig(language),
+) {
   /* c8 ignore next 3 */
   if (typeof input !== "string") {
     return [input];
   }
 
   const splitRule = splitRegex[language];
-  const tokens = input.toLowerCase().split(splitRule).map(replaceDiacritics);
+  const tokens = input
+    .toLowerCase()
+    .split(splitRule)
+    .map(token => normalizeToken(token, language, tokenizerConfig))
+    .filter(Boolean);
+
   const trimTokens = trim(tokens);
 
   if (!allowDuplicates) {
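The normalization above is cached per `${language}:${token}` key; note that the key does not include the tokenizer config, so entries computed under one config are reused under another — which is why the tests in this patch call `normalizationCache.clear()` whenever they switch configs. A small sketch:

```ts
import { tokenize, normalizationCache } from "./src/tokenizer";
import { defaultTokenizerConfig } from "./src/lyra";

const config = defaultTokenizerConfig("english");
tokenize("I baked some cakes", "english", false, config);
// -> ["bake", "cake"]: "i" and "some" are stop-words, the rest is stemmed.

// Clear the cache before tokenizing with a different config, or stale
// normalizations will leak across configs.
normalizationCache.clear();
```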
diff --git a/src/tokenizer/stemmer/en.ts b/src/tokenizer/stemmer/en.ts
new file mode 100644
index 000000000..6157aa350
--- /dev/null
+++ b/src/tokenizer/stemmer/en.ts
@@ -0,0 +1,187 @@
+const step2List = {
+  ational: "ate",
+  tional: "tion",
+  enci: "ence",
+  anci: "ance",
+  izer: "ize",
+  bli: "ble",
+  alli: "al",
+  entli: "ent",
+  eli: "e",
+  ousli: "ous",
+  ization: "ize",
+  ation: "ate",
+  ator: "ate",
+  alism: "al",
+  iveness: "ive",
+  fulness: "ful",
+  ousness: "ous",
+  aliti: "al",
+  iviti: "ive",
+  biliti: "ble",
+  logi: "log",
+};
+
+const step3List = {
+  icate: "ic",
+  ative: "",
+  alize: "al",
+  iciti: "ic",
+  ical: "ic",
+  ful: "",
+  ness: "",
+};
+
+// Consonant
+const c = "[^aeiou]";
+// Vowel
+const v = "[aeiouy]";
+// Consonant sequence
+const C = c + "[^aeiouy]*";
+// Vowel sequence
+const V = v + "[aeiou]*";
+
+// [C]VC... is m>0
+const mgr0 = "^(" + C + ")?" + V + C;
+// [C]VC[V] is m=1
+const meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";
+// [C]VCVC... is m>1
+const mgr1 = "^(" + C + ")?" + V + C + V + C;
+// vowel in stem
+const s_v = "^(" + C + ")?" + v;
+
+export function stem(w: string): string {
+  let stem: string;
+  let suffix: string;
+  let re: RegExp;
+  let re2: RegExp;
+  let re3: RegExp;
+  let re4: RegExp;
+
+  if (w.length < 3) {
+    return w;
+  }
+
+  const firstch = w.substring(0, 1);
+  if (firstch == "y") {
+    w = firstch.toUpperCase() + w.substring(1);
+  }
+
+  re = /^(.+?)(ss|i)es$/;
+  re2 = /^(.+?)([^s])s$/;
+
+  if (re.test(w)) {
+    w = w.replace(re, "$1$2");
+  } else if (re2.test(w)) {
+    w = w.replace(re2, "$1$2");
+  }
+
+  re = /^(.+?)eed$/;
+  re2 = /^(.+?)(ed|ing)$/;
+  if (re.test(w)) {
+    const fp = re.exec(w)!;
+    re = new RegExp(mgr0);
+    if (re.test(fp[1])) {
+      re = /.$/;
+      w = w.replace(re, "");
+    }
+  } else if (re2.test(w)) {
+    const fp = re2.exec(w)!;
+    stem = fp[1];
+    re2 = new RegExp(s_v);
+    if (re2.test(stem)) {
+      w = stem;
+      re2 = /(at|bl|iz)$/;
+      re3 = new RegExp("([^aeiouylsz])\\1$");
+      re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+      if (re2.test(w)) {
+        w = w + "e";
+      } else if (re3.test(w)) {
+        re = /.$/;
+        w = w.replace(re, "");
+      } else if (re4.test(w)) {
+        w = w + "e";
+      }
+    }
+  }
+
+  re = /^(.+?)y$/;
+  if (re.test(w)) {
+    const fp = re.exec(w)!;
+    stem = fp?.[1];
+    re = new RegExp(s_v);
+    if (stem && re.test(stem)) {
+      w = stem + "i";
+    }
+  }
+
+  re =
+    /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
+  if (re.test(w)) {
+    const fp = re.exec(w)!;
+    stem = fp?.[1];
+    suffix = fp?.[2];
+    re = new RegExp(mgr0);
+    if (stem && re.test(stem)) {
+      // eslint-disable-next-line @typescript-eslint/ban-ts-comment
+      // @ts-ignore
+      w = stem + step2List[suffix];
+    }
+  }
+
+  re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
+  if (re.test(w)) {
+    const fp = re.exec(w)!;
+    stem = fp?.[1];
+    suffix = fp?.[2];
+    re = new RegExp(mgr0);
+    if (stem && re.test(stem)) {
+      // eslint-disable-next-line @typescript-eslint/ban-ts-comment
+      // @ts-ignore
+      w = stem + step3List[suffix];
+    }
+  }
+
+  re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
+  re2 = /^(.+?)(s|t)(ion)$/;
+  if (re.test(w)) {
+    const fp = re.exec(w)!;
+    stem = fp?.[1];
+    re = new RegExp(mgr1);
+    if (stem && re.test(stem)) {
+      w = stem;
+    }
+  } else if (re2.test(w)) {
+    const fp = re2.exec(w)!;
+    stem = (fp?.[1] ?? "") + (fp?.[2] ?? "");
+    re2 = new RegExp(mgr1);
+    if (re2.test(stem)) {
+      w = stem;
+    }
+  }
+
+  re = /^(.+?)e$/;
+  if (re.test(w)) {
+    const fp = re.exec(w)!;
+    stem = fp?.[1];
+    re = new RegExp(mgr1);
+    re2 = new RegExp(meq1);
+    re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+    if (stem && (re.test(stem) || (re2.test(stem) && !re3.test(stem)))) {
+      w = stem;
+    }
+  }
+
+  re = /ll$/;
+  re2 = new RegExp(mgr1);
+  if (re.test(w) && re2.test(w)) {
+    re = /.$/;
+    w = w.replace(re, "");
+  }
+
+  if (firstch == "y") {
+    w = firstch.toLowerCase() + w.substring(1);
+  }
+
+  return w;
+}
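The file above is the classic Porter stemmer. Expected behaviour, per the tests and snapshots elsewhere in this patch — note that stems are index keys, not dictionary words:

```ts
import { stem } from "./src/tokenizer/stemmer/en";

stem("searching"); // -> "search"
stem("baked");     // -> "bake"
stem("lazy");      // -> "lazi"  (not a dictionary word, by design)
```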
""; + re2 = new RegExp(mgr1); + if (re2.test(stem)) { + w = stem; + } + } + + re = /^(.+?)e$/; + if (re.test(w)) { + const fp = re.exec(w)!; + stem = fp?.[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (stem && (re.test(stem) || (re2.test(stem) && !re3.test(stem)))) { + w = stem; + } + } + + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re, ""); + } + + if (firstch == "y") { + w = firstch.toLowerCase() + w.substring(1); + } + + return w; +} diff --git a/src/tokenizer/stemmer/index.ts b/src/tokenizer/stemmer/index.ts new file mode 100644 index 000000000..0beef0b71 --- /dev/null +++ b/src/tokenizer/stemmer/index.ts @@ -0,0 +1,14 @@ +import type { Language } from "../languages"; +import { stem as ENStemmer } from "./en"; + +export type Stemmer = (word: string) => string; + +type StemmerMap = { + [key in Language]: Stemmer; +}; + +export const stemmers: Partial = { + english: ENStemmer, +}; + +export const availableStemmers = Object.keys(stemmers); diff --git a/src/tokenizer/stop-words/en.ts b/src/tokenizer/stop-words/en.ts new file mode 100644 index 000000000..cced16a13 --- /dev/null +++ b/src/tokenizer/stop-words/en.ts @@ -0,0 +1,204 @@ +export const en = [ + "i", + "me", + "my", + "myself", + "we", + "us", + "our", + "ours", + "ourselves", + + "you", + "your", + "yours", + "yourself", + "yourselves", + + "he", + "him", + "his", + "himself", + + "she", + "her", + "hers", + "herself", + "it", + "its", + "itself", + + "they", + "them", + "their", + "theirs", + "themselves", + + "what", + "which", + "who", + "whom", + "this", + "that", + "these", + "those", + + "am", + "is", + "are", + "was", + "were", + "be", + "been", + "being", + + "have", + "has", + "had", + "having", + + "do", + "does", + "did", + "doing", + + "will", + "would", + + "shall", + "should", + "can", + "could", + + "may", + "might", + "must", + "ought", + + "i'm", + "you're", + "he's", + "she's", + "it's", + "we're", + "they're", + "i've", + "you've", + "we've", + "they've", + "i'd", + "you'd", + "he'd", + "she'd", + "we'd", + "they'd", + "i'll", + "you'll", + "he'll", + "she'll", + "we'll", + "they'll", + + "isn't", + "aren't", + "wasn't", + "weren't", + "hasn't", + "haven't", + "hadn't", + "doesn't", + "don't", + "didn't", + + "won't", + "wouldn't", + "shan't", + "shouldn't", + "can't", + "cannot", + "couldn't", + "mustn't", + + "let's", + "that's", + "who's", + "what's", + "here's", + "there's", + "when's", + "where's", + "why's", + "how's", + + "an", + "the", + + "and", + "but", + "if", + "or", + "because", + "as", + "until", + "while", + + "of", + "at", + "by", + "for", + "with", + "about", + "against", + "between", + "into", + "through", + "during", + "before", + "after", + "above", + "below", + "to", + "from", + "up", + "down", + "in", + "out", + "on", + "off", + "over", + "under", + + "again", + "further", + "then", + "once", + + "here", + "there", + "when", + "where", + "why", + "how", + + "all", + "any", + "both", + "each", + "few", + "more", + "most", + "other", + "some", + "such", + + "no", + "nor", + "not", + "only", + "own", + "same", + "so", + "than", + "too", + "very", +]; diff --git a/src/tokenizer/stop-words/index.ts b/src/tokenizer/stop-words/index.ts new file mode 100644 index 000000000..54d1464fc --- /dev/null +++ b/src/tokenizer/stop-words/index.ts @@ -0,0 +1,12 @@ +import type { Language } from "../languages"; +import { en } from "./en"; + +type StopWordsMap = { + [key in Language]: 
diff --git a/tap-snapshots/tests/lyra.dataset.test.ts.test.cjs b/tap-snapshots/tests/lyra.dataset.test.ts.test.cjs
index e47566f9d..188c48d18 100644
--- a/tap-snapshots/tests/lyra.dataset.test.ts.test.cjs
+++ b/tap-snapshots/tests/lyra.dataset.test.ts.test.cjs
@@ -7,7 +7,7 @@
 'use strict'
 exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-1 1`] = `
 Object {
-  "count": 2240,
+  "count": 2357,
   "elapsed": 0n,
   "hits": Array [
     Object {
@@ -80,6 +80,16 @@ Object {
       "granularity": "year",
       "id": "",
     },
+    Object {
+      "categories": Object {
+        "first": "By place",
+        "second": "Egypt",
+      },
+      "date": "-276",
+      "description": "The first of the Syrian Wars starts between Egypt's Ptolemy II and Seleucid emperor Antiochus I Soter. The Egyptians invade northern Syria, but Antiochus defeats and repels his opponent's army.",
+      "granularity": "year",
+      "id": "",
+    },
     Object {
       "categories": Object {
         "first": "By place",
@@ -100,6 +110,15 @@ Object {
       "granularity": "year",
       "id": "",
     },
+  ],
+}
+`
+
+exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-2 1`] = `
+Object {
+  "count": 2357,
+  "elapsed": 0n,
+  "hits": Array [
     Object {
       "categories": Object {
         "first": "By place",
@@ -110,15 +129,6 @@ Object {
       "granularity": "year",
       "id": "",
     },
-  ],
-}
-`
-
-exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-2 1`] = `
-Object {
-  "count": 2240,
-  "elapsed": 0n,
-  "hits": Array [
     Object {
       "categories": Object {
         "first": "By place",
@@ -129,6 +139,16 @@ Object {
       "granularity": "year",
       "id": "",
     },
+    Object {
+      "categories": Object {
+        "first": "By place",
+        "second": "China",
+      },
+      "date": "-260",
+      "description": "In the Battle of Changping, the army of the Qin state routs the army of Zhao, establishing its military superiority over all other Chinese states during the Warring States Period. The battle, in which Zhao forces are led by Lian Po and Zhao Kuo, while Qin is led by Wang He and Bai Qi, takes place near modern-day Gaoping in Shanxi and hundreds of thousands of soldiers from Zhao are executed after the battle.",
+      "granularity": "year",
+      "id": "",
+    },
     Object {
       "categories": Object {
         "first": "By place",
@@ -199,6 +219,15 @@ Object {
       "granularity": "year",
       "id": "",
     },
+  ],
+}
+`
+
+exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-3 1`] = `
+Object {
+  "count": 2357,
+  "elapsed": 0n,
+  "hits": Array [
     Object {
       "categories": Object {
         "first": "By place",
@@ -219,15 +248,6 @@ Object {
       "granularity": "year",
       "id": "",
     },
-  ],
-}
-`
-
-exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-3 1`] = `
-Object {
-  "count": 2240,
-  "elapsed": 0n,
-  "hits": Array [
     Object {
       "categories": Object {
         "first": "By place",
@@ -308,26 +328,6 @@ Object {
       "granularity": "year",
       "id": "",
     },
-    Object {
-      "categories": Object {
-        "first": "By place",
-        "second": "Carthage",
-      },
-      "date": "-239",
-      "description": "Concerned that Hamilcar Barca's leniency in pardoning those who he has captured who have participated in the Mercenary War will encourage others to defect, Mathos and Spendius order the mutilation and execution of ampquotabout seven hundredampquot Carthaginian prisoners, including Gesco. With the mercenaries jointly guilty of these atrocities, defectors dare not face Carthaginian justice under Hamilcar.",
-      "granularity": "year",
-      "id": "",
-    },
-    Object {
-      "categories": Object {
-        "first": "By place",
-        "second": "Carthage",
-      },
-      "date": "-238",
-      "description": "The Carthaginian armies besiege and capture Utica and Hippacritae. This ends the Carthaginian civil war.",
-      "granularity": "year",
-      "id": "",
-    },
   ],
 }
 `
diff --git a/tap-snapshots/tests/tokenizer.test.ts.test.cjs b/tap-snapshots/tests/tokenizer.test.ts.test.cjs
index 5d86835c7..300595ac4 100644
--- a/tap-snapshots/tests/tokenizer.test.ts.test.cjs
+++ b/tap-snapshots/tests/tokenizer.test.ts.test.cjs
@@ -25,41 +25,33 @@ Array [
 exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english > Should tokenize and stem correctly in english-O1 1`] = `
 Array [
-  "the",
   "quick",
   "brown",
   "fox",
-  "jumps",
-  "over",
-  "lazy",
+  "jump",
+  "lazi",
   "dog",
 ]
 `
 
 exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english > Should tokenize and stem correctly in english-O2 1`] = `
 Array [
-  "i",
-  "baked",
-  "some",
-  "cakes",
+  "bake",
+  "cake",
 ]
 `
 
 exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english and allow duplicates > Should tokenize and stem correctly in english and allow duplicates-O1 1`] = `
 Array [
-  "this",
-  "is",
   "a",
   "test",
-  "with",
-  "duplicates",
+  "duplic",
 ]
 `
 
 exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english and allow duplicates > Should tokenize and stem correctly in english and allow duplicates-O2 1`] = `
 Array [
-  "it's",
-  "alive",
+  "aliv",
 ]
 `
"", + }, + })); + + await insertBatch(db, events); }); t.test("should correctly populate the database with a large dataset", t => { @@ -152,8 +134,8 @@ t.test("lyra.dataset", async t => { t.matchSnapshot(s2, `${t.name}-page-2`); t.matchSnapshot(s3, `${t.name}-page-3`); - t.equal(s4.count, 2240); - t.equal(s5.hits.length, 1); + t.equal(s4.count, 2357); + t.equal(s5.hits.length, 10); }); t.test("should correctly delete documents", t => { @@ -179,6 +161,6 @@ t.test("lyra.dataset", async t => { offset: 0, }); - t.equal(newSearch.count, 2230); + t.equal(newSearch.count, 2347); }); }); diff --git a/tests/lyra.test.ts b/tests/lyra.test.ts index 0f440ac53..e08b06370 100644 --- a/tests/lyra.test.ts +++ b/tests/lyra.test.ts @@ -132,8 +132,8 @@ t.test("lyra", t => { const result1 = search(db, { term: "fox", exact: true }); const result2 = search(db, { term: "dog", exact: true }); - t.equal(result1.count, 1); - t.equal(result2.count, 2); + t.equal(result1.count, 2); + t.equal(result2.count, 3); // Prefix search const result3 = search(db, { term: "fox", exact: false }); @@ -146,7 +146,7 @@ t.test("lyra", t => { const result5 = search(db, { term: "fx", tolerance: 1 }); const result6 = search(db, { term: "dg", tolerance: 2 }); - t.equal(result5.count, 1); + t.equal(result5.count, 2); t.equal(result6.count, 4); }); diff --git a/tests/stemmer.en.test.ts b/tests/stemmer.en.test.ts new file mode 100644 index 000000000..4e26f50df --- /dev/null +++ b/tests/stemmer.en.test.ts @@ -0,0 +1,25 @@ +import t from "tap"; +import { stem } from "../src/tokenizer/stemmer/en"; + +t.test("ensligh stemmer", t => { + t.plan(1); + + t.test("should correctly stem words", t => { + t.plan(14); + + t.equal(stem("cats"), "cat"); + t.equal(stem("cars"), "car"); + t.equal(stem("beautiful"), "beauti"); + t.equal(stem("compressing"), "compress"); + t.equal(stem("inception"), "incep"); + t.equal(stem("searching"), "search"); + t.equal(stem("outragious"), "outragi"); + t.equal(stem("yelling"), "yell"); + t.equal(stem("overseed"), "overse"); + t.equal(stem("hopefully"), "hopefulli"); + t.equal(stem("mindfullness"), "mindful"); + t.equal(stem("mindfullness"), "mindful"); + t.equal(stem("chill"), "chill"); + t.equal(stem("rational"), "ration"); + }); +}); diff --git a/tests/tokenizer.test.ts b/tests/tokenizer.test.ts index 4a2d5ed6e..eaf9f26c3 100644 --- a/tests/tokenizer.test.ts +++ b/tests/tokenizer.test.ts @@ -1,5 +1,6 @@ import t from "tap"; -import { tokenize } from "../src/tokenizer"; +import { create } from "../src/lyra"; +import { tokenize, normalizationCache } from "../src/tokenizer"; t.test("Tokenizer", t => { t.plan(10); @@ -127,10 +128,129 @@ t.test("Tokenizer", t => { const I1 = "de kleine koeien"; const I2 = "Ik heb wat taarten gemaakt"; - const O1 = tokenize(I1, "dutch"); const O2 = tokenize(I2, "dutch"); + const O1 = tokenize(I1, "dutch"); t.matchSnapshot(O1, `${t.name}-O1`); t.matchSnapshot(O2, `${t.name}-O2`); }); }); + +t.test("Custom stop-words rules", t => { + t.plan(5); + + t.test("custom array of stop-words", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + customStopWords: ["quick", "brown", "fox", "dog"], + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + t.same(O1, ["the", "jump", "over", "lazi"]); + t.same(O2, ["i", "bake", "some", "cake"]); + }); + + t.test("custom stop-words 
function", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + customStopWords(words: string[]): string[] { + return [...words, "quick", "brown", "fox", "dog"]; + }, + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + t.same(O1, ["jump", "lazi"]); + t.same(O2, ["bake", "cake"]); + }); + + t.test("disable stop-words", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + enableStopWords: false, + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + t.same(O1, ["the", "quick", "brown", "fox", "jump", "over", "lazi", "dog"]); + t.same(O2, ["i", "bake", "some", "cake"]); + }); + + t.test("disable stemming", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + enableStemming: false, + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + t.same(O1, ["quick", "brown", "fox", "jumps", "lazy", "dog"]); + t.same(O2, ["baked", "cakes"]); + }); + + t.test("custom stemming function", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + stemmingFn: word => `${word}-ish`, + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + console.log(O1); + console.log(O2); + + t.same(O1, ["quick-ish", "brown-ish", "fox-ish", "jumps-ish", "lazy-ish", "dog-ish"]); + t.same(O2, ["baked-ish", "cakes-ish"]); + }); +});