From 1a48af050912751a33dfa793103a6c9f6a26e6ee Mon Sep 17 00:00:00 2001 From: Michele Riva Date: Sun, 28 Aug 2022 19:33:56 +0200 Subject: [PATCH 1/4] feat(stemmer): adds english stemmer --- src/tokenizer/index.ts | 24 ++- src/tokenizer/stemmer/en.ts | 189 ++++++++++++++++++ src/tokenizer/stemmer/index.ts | 14 ++ .../tests/lyra.dataset.test.ts.test.cjs | 78 ++++---- .../tests/tokenizer.test.ts.test.cjs | 16 +- tests/lyra.dataset.test.ts | 50 ++--- tests/lyra.test.ts | 6 +- tests/stemmer.en.test.ts | 24 +++ 8 files changed, 316 insertions(+), 85 deletions(-) create mode 100644 src/tokenizer/stemmer/en.ts create mode 100644 src/tokenizer/stemmer/index.ts create mode 100644 tests/stemmer.en.test.ts diff --git a/src/tokenizer/index.ts b/src/tokenizer/index.ts index 875d4536e..4d8d8ab71 100644 --- a/src/tokenizer/index.ts +++ b/src/tokenizer/index.ts @@ -1,5 +1,6 @@ import { Language } from "./languages"; import { replaceDiacritics } from "./diacritics"; +import { availableStemmers, stemmers } from "./stemmer"; const splitRegex: Record<Language, RegExp> = { dutch: /[^a-z0-9_'-]+/gim, @@ -13,6 +14,23 @@ const splitRegex: Record<Language, RegExp> = { swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim, }; +const normalizationCache = new Map<string, string>(); + +function normalizeToken(token: string, language: Language): string { + const key = `${language}-${token}`; + if (normalizationCache.has(key)) { + return normalizationCache.get(key)!; + } else { + if (availableStemmers.includes(language)) { + token = stemmers[language]!(token); + } + + token = replaceDiacritics(token); + normalizationCache.set(key, token); + return token; + } +} + export function tokenize(input: string, language: Language = "english", allowDuplicates = false) { /* c8 ignore next 3 */ if (typeof input !== "string") { @@ -20,7 +38,11 @@ export function tokenize(input: string, language: Language = "english", allowDup } const splitRule = splitRegex[language]; - const tokens = input.toLowerCase().split(splitRule).map(replaceDiacritics); + const tokens = input + .toLowerCase() + .split(splitRule) + .map(token => normalizeToken(token, language)); + const trimTokens = trim(tokens); if (!allowDuplicates) { diff --git a/src/tokenizer/stemmer/en.ts b/src/tokenizer/stemmer/en.ts new file mode 100644 index 000000000..6157aa350 --- /dev/null +++ b/src/tokenizer/stemmer/en.ts @@ -0,0 +1,189 @@ +const step2List = { + ational: "ate", + tional: "tion", + enci: "ence", + anci: "ance", + izer: "ize", + bli: "ble", + alli: "al", + entli: "ent", + eli: "e", + ousli: "ous", + ization: "ize", + ation: "ate", + ator: "ate", + alism: "al", + iveness: "ive", + fulness: "ful", + ousness: "ous", + aliti: "al", + iviti: "ive", + biliti: "ble", + logi: "log", +}; + +const step3List = { + icate: "ic", + ative: "", + alize: "al", + iciti: "ic", + ical: "ic", + ful: "", + ness: "", +}; + +// Consonant +const c = "[^aeiou]"; +// Vowel +const v = "[aeiouy]"; +// Consonant sequence +const C = c + "[^aeiouy]*"; +// Vowel sequence +const V = v + "[aeiou]*"; + +// [C]VC... is m>0 +const mgr0 = "^(" + C + ")?" + V + C; +// [C]VC[V] is m=1 +const meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; +// [C]VCVC... is m>1 +const mgr1 = "^(" + C + ")?" + V + C + V + C; +// vowel in stem +const s_v = "^(" + C + ")?" 
+ v; + +export function stem(w: string): string { + let stem: string; + let suffix: string; + let re: RegExp; + let re2: RegExp; + let re3: RegExp; + let re4: RegExp; + + if (w.length < 3) { + return w; + } + + const firstch = w.substring(0, 1); + if (firstch == "y") { + w = firstch.toUpperCase() + w.substring(1); + } + + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) { + w = w.replace(re, "$1$2"); + } else if (re2.test(w)) { + w = w.replace(re2, "$1$2"); + } + + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + const fp = re.exec(w)!; + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re, ""); + } + } else if (re2.test(w)) { + const fp = re2.exec(w)!; + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) { + w = w + "e"; + } else if (re3.test(w)) { + re = /.$/; + w = w.replace(re, ""); + } else if (re4.test(w)) { + w = w + "e"; + } + } + } + + re = /^(.+?)y$/; + if (re.test(w)) { + const fp = re.exec(w)!; + stem = fp?.[1]; + re = new RegExp(s_v); + if (stem && re.test(stem)) { + w = stem + "i"; + } + } + + re = + /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + const fp = re.exec(w)!; + stem = fp?.[1]; + suffix = fp?.[2]; + re = new RegExp(mgr0); + if (stem && re.test(stem)) { + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + w = stem + step2List[suffix]; + } + } + + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + const fp = re.exec(w)!; + stem = fp?.[1]; + suffix = fp?.[2]; + re = new RegExp(mgr0); + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + if (stem && re.test(stem)) { + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + w = stem + step3List[suffix]; + } + } + + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + const fp = re.exec(w)!; + stem = fp?.[1]; + re = new RegExp(mgr1); + if (stem && re.test(stem)) { + w = stem; + } + } else if (re2.test(w)) { + const fp = re2.exec(w)!; + stem = fp?.[1] ?? "" + fp?.[2] ?? 
""; + re2 = new RegExp(mgr1); + if (re2.test(stem)) { + w = stem; + } + } + + re = /^(.+?)e$/; + if (re.test(w)) { + const fp = re.exec(w)!; + stem = fp?.[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (stem && (re.test(stem) || (re2.test(stem) && !re3.test(stem)))) { + w = stem; + } + } + + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re, ""); + } + + if (firstch == "y") { + w = firstch.toLowerCase() + w.substring(1); + } + + return w; +} diff --git a/src/tokenizer/stemmer/index.ts b/src/tokenizer/stemmer/index.ts new file mode 100644 index 000000000..0895901d2 --- /dev/null +++ b/src/tokenizer/stemmer/index.ts @@ -0,0 +1,14 @@ +import { stem as ENStemmer } from "./en"; +import { Language } from "../languages"; + +type Stemmer = (word: string) => string; + +type StemmerMap = { + [key in Language]: Stemmer; +}; + +export const stemmers: Partial = { + english: ENStemmer, +}; + +export const availableStemmers = Object.keys(stemmers); diff --git a/tap-snapshots/tests/lyra.dataset.test.ts.test.cjs b/tap-snapshots/tests/lyra.dataset.test.ts.test.cjs index e47566f9d..188c48d18 100644 --- a/tap-snapshots/tests/lyra.dataset.test.ts.test.cjs +++ b/tap-snapshots/tests/lyra.dataset.test.ts.test.cjs @@ -7,7 +7,7 @@ 'use strict' exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-1 1`] = ` Object { - "count": 2240, + "count": 2357, "elapsed": 0n, "hits": Array [ Object { @@ -80,6 +80,16 @@ Object { "granularity": "year", "id": "", }, + Object { + "categories": Object { + "first": "By place", + "second": "Egypt", + }, + "date": "-276", + "description": "The first of the Syrian Wars starts between Egypt's Ptolemy II and Seleucid emperor Antiochus I Soter. The Egyptians invade northern Syria, but Antiochus defeats and repels his opponent's army.", + "granularity": "year", + "id": "", + }, Object { "categories": Object { "first": "By place", @@ -100,6 +110,15 @@ Object { "granularity": "year", "id": "", }, + ], +} +` + +exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-2 1`] = ` +Object { + "count": 2357, + "elapsed": 0n, + "hits": Array [ Object { "categories": Object { "first": "By place", @@ -110,15 +129,6 @@ Object { "granularity": "year", "id": "", }, - ], -} -` - -exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-2 1`] = ` -Object { - "count": 2240, - "elapsed": 0n, - "hits": Array [ Object { "categories": Object { "first": "By place", @@ -129,6 +139,16 @@ Object { "granularity": "year", "id": "", }, + Object { + "categories": Object { + "first": "By place", + "second": "China", + }, + "date": "-260", + "description": "In the Battle of Changping, the army of the Qin state routs the army of Zhao, establishing its military superiority over all other Chinese states during the Warring States Period. 
The battle, in which Zhao forces are led by Lian Po and Zhao Kuo, while Qin is led by Wang He and Bai Qi, takes place near modern-day Gaoping in Shanxi and hundreds of thousands of soldiers from Zhao are executed after the battle.", + "granularity": "year", + "id": "", + }, Object { "categories": Object { "first": "By place", @@ -199,6 +219,15 @@ Object { "granularity": "year", "id": "", }, + ], +} +` + +exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-3 1`] = ` +Object { + "count": 2357, + "elapsed": 0n, + "hits": Array [ Object { "categories": Object { "first": "By place", @@ -219,15 +248,6 @@ Object { "granularity": "year", "id": "", }, - ], -} -` - -exports[`tests/lyra.dataset.test.ts TAP lyra.dataset should perform paginate search > should perform paginate search-page-3 1`] = ` -Object { - "count": 2240, - "elapsed": 0n, - "hits": Array [ Object { "categories": Object { "first": "By place", @@ -308,26 +328,6 @@ Object { "granularity": "year", "id": "", }, - Object { - "categories": Object { - "first": "By place", - "second": "Carthage", - }, - "date": "-239", - "description": "Concerned that Hamilcar Barca's leniency in pardoning those who he has captured who have participated in the Mercenary War will encourage others to defect, Mathos and Spendius order the mutilation and execution of ampquotabout seven hundredampquot Carthaginian prisoners, including Gesco. With the mercenaries jointly guilty of these atrocities, defectors dare not face Carthaginian justice under Hamilcar.", - "granularity": "year", - "id": "", - }, - Object { - "categories": Object { - "first": "By place", - "second": "Carthage", - }, - "date": "-238", - "description": "The Carthaginian armies besiege and capture Utica and Hippacritae. 
This ends the Carthaginian civil war.", - "granularity": "year", - "id": "", - }, ], } ` diff --git a/tap-snapshots/tests/tokenizer.test.ts.test.cjs b/tap-snapshots/tests/tokenizer.test.ts.test.cjs index 5d86835c7..d545b3bb6 100644 --- a/tap-snapshots/tests/tokenizer.test.ts.test.cjs +++ b/tap-snapshots/tests/tokenizer.test.ts.test.cjs @@ -29,9 +29,9 @@ Array [ "quick", "brown", "fox", - "jumps", + "jump", "over", - "lazy", + "lazi", "dog", ] ` @@ -39,27 +39,27 @@ Array [ exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english > Should tokenize and stem correctly in english-O2 1`] = ` Array [ "i", - "baked", + "bake", "some", - "cakes", + "cake", ] ` exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english and allow duplicates > Should tokenize and stem correctly in english and allow duplicates-O1 1`] = ` Array [ - "this", + "thi", "is", "a", "test", "with", - "duplicates", + "duplic", ] ` exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english and allow duplicates > Should tokenize and stem correctly in english and allow duplicates-O2 1`] = ` Array [ - "it's", - "alive", + "it'", + "aliv", ] ` diff --git a/tests/lyra.dataset.test.ts b/tests/lyra.dataset.test.ts index 40c409540..f06c79712 100644 --- a/tests/lyra.dataset.test.ts +++ b/tests/lyra.dataset.test.ts @@ -1,5 +1,5 @@ import t from "tap"; -import { create, insert, remove, search } from "../src/lyra"; +import { create, insert, insertBatch, remove, search } from "../src/lyra"; import type { PropertiesSchema, SearchResult } from "../src/lyra"; import dataset from "./datasets/events.json"; @@ -31,37 +31,19 @@ const db = create({ t.test("lyra.dataset", async t => { t.plan(3); - t.before(() => { + t.before(async () => { // eslint-disable-next-line @typescript-eslint/no-explicit-any - const events = (dataset as any).result.events; - - let i = 0; - return new Promise(resolve => { - function insertBatch() { - const batch = events.slice(i * 1000, (i + 1) * 1000); - i++; - - if (!batch.length) { - return resolve(); - } - - for (const event of batch) { - insert(db, { - date: event.date, - description: event.description, - granularity: event.granularity, - categories: { - first: event.category1 ?? "", - second: event.category2 ?? "", - }, - }); - } - - setImmediate(insertBatch); - } - - setImmediate(insertBatch); - }); + const events = (dataset as any).result.events.map((ev: any) => ({ + date: ev.date, + description: ev.description, + granularity: ev.granularity, + categories: { + first: ev.category1 ?? "", + second: ev.category2 ?? 
"", + }, + })); + + await insertBatch(db, events); }); t.test("should correctly populate the database with a large dataset", t => { @@ -152,8 +134,8 @@ t.test("lyra.dataset", async t => { t.matchSnapshot(s2, `${t.name}-page-2`); t.matchSnapshot(s3, `${t.name}-page-3`); - t.equal(s4.count, 2240); - t.equal(s5.hits.length, 1); + t.equal(s4.count, 2357); + t.equal(s5.hits.length, 10); }); t.test("should correctly delete documents", t => { @@ -179,6 +161,6 @@ t.test("lyra.dataset", async t => { offset: 0, }); - t.equal(newSearch.count, 2230); + t.equal(newSearch.count, 2347); }); }); diff --git a/tests/lyra.test.ts b/tests/lyra.test.ts index 0f440ac53..e08b06370 100644 --- a/tests/lyra.test.ts +++ b/tests/lyra.test.ts @@ -132,8 +132,8 @@ t.test("lyra", t => { const result1 = search(db, { term: "fox", exact: true }); const result2 = search(db, { term: "dog", exact: true }); - t.equal(result1.count, 1); - t.equal(result2.count, 2); + t.equal(result1.count, 2); + t.equal(result2.count, 3); // Prefix search const result3 = search(db, { term: "fox", exact: false }); @@ -146,7 +146,7 @@ t.test("lyra", t => { const result5 = search(db, { term: "fx", tolerance: 1 }); const result6 = search(db, { term: "dg", tolerance: 2 }); - t.equal(result5.count, 1); + t.equal(result5.count, 2); t.equal(result6.count, 4); }); diff --git a/tests/stemmer.en.test.ts b/tests/stemmer.en.test.ts new file mode 100644 index 000000000..4e26f50df --- /dev/null +++ b/tests/stemmer.en.test.ts @@ -0,0 +1,25 @@ +import t from "tap"; +import { stem } from "../src/tokenizer/stemmer/en"; + +t.test("ensligh stemmer", t => { + t.plan(1); + + t.test("should correctly stem words", t => { + t.plan(14); + + t.equal(stem("cats"), "cat"); + t.equal(stem("cars"), "car"); + t.equal(stem("beautiful"), "beauti"); + t.equal(stem("compressing"), "compress"); + t.equal(stem("inception"), "incep"); + t.equal(stem("searching"), "search"); + t.equal(stem("outragious"), "outragi"); + t.equal(stem("yelling"), "yell"); + t.equal(stem("overseed"), "overse"); + t.equal(stem("hopefully"), "hopefulli"); + t.equal(stem("mindfullness"), "mindful"); + t.equal(stem("mindfullness"), "mindful"); + t.equal(stem("chill"), "chill"); + t.equal(stem("rational"), "ration"); + }); +}); From 680e13d74a53b32c41898bae486340bc5019ba82 Mon Sep 17 00:00:00 2001 From: Michele Riva Date: Mon, 29 Aug 2022 10:06:33 +0200 Subject: [PATCH 2/4] feat(tokenizer): adds english stop-words list --- src/tokenizer/index.ts | 11 + src/tokenizer/stemmer/index.ts | 2 +- src/tokenizer/stop-words/en.ts | 204 ++++++++++++++++++ src/tokenizer/stop-words/index.ts | 12 ++ .../tests/tokenizer.test.ts.test.cjs | 12 +- 5 files changed, 232 insertions(+), 9 deletions(-) create mode 100644 src/tokenizer/stop-words/en.ts create mode 100644 src/tokenizer/stop-words/index.ts diff --git a/src/tokenizer/index.ts b/src/tokenizer/index.ts index 4d8d8ab71..f6c0a2014 100644 --- a/src/tokenizer/index.ts +++ b/src/tokenizer/index.ts @@ -1,6 +1,7 @@ import { Language } from "./languages"; import { replaceDiacritics } from "./diacritics"; import { availableStemmers, stemmers } from "./stemmer"; +import { availableStopWords, stopWords } from "./stop-words"; const splitRegex: Record = { dutch: /[^a-z0-9_'-]+/gim, @@ -21,6 +22,16 @@ function normalizeToken(token: string, language: Language): string { if (normalizationCache.has(key)) { return normalizationCache.get(key)!; } else { + // Remove stop-words + if (availableStopWords.includes(language)) { + if (stopWords[language]!.includes(token)) { + const token = ""; + 
normalizationCache.set(key, token); + return token; + } + } + + // Stem token if (availableStemmers.includes(language)) { token = stemmers[language]!(token); } diff --git a/src/tokenizer/stemmer/index.ts b/src/tokenizer/stemmer/index.ts index 0895901d2..59faf33bd 100644 --- a/src/tokenizer/stemmer/index.ts +++ b/src/tokenizer/stemmer/index.ts @@ -1,5 +1,5 @@ +import type { Language } from "../languages"; import { stem as ENStemmer } from "./en"; -import { Language } from "../languages"; type Stemmer = (word: string) => string; diff --git a/src/tokenizer/stop-words/en.ts b/src/tokenizer/stop-words/en.ts new file mode 100644 index 000000000..cced16a13 --- /dev/null +++ b/src/tokenizer/stop-words/en.ts @@ -0,0 +1,204 @@ +export const en = [ + "i", + "me", + "my", + "myself", + "we", + "us", + "our", + "ours", + "ourselves", + + "you", + "your", + "yours", + "yourself", + "yourselves", + + "he", + "him", + "his", + "himself", + + "she", + "her", + "hers", + "herself", + "it", + "its", + "itself", + + "they", + "them", + "their", + "theirs", + "themselves", + + "what", + "which", + "who", + "whom", + "this", + "that", + "these", + "those", + + "am", + "is", + "are", + "was", + "were", + "be", + "been", + "being", + + "have", + "has", + "had", + "having", + + "do", + "does", + "did", + "doing", + + "will", + "would", + + "shall", + "should", + "can", + "could", + + "may", + "might", + "must", + "ought", + + "i'm", + "you're", + "he's", + "she's", + "it's", + "we're", + "they're", + "i've", + "you've", + "we've", + "they've", + "i'd", + "you'd", + "he'd", + "she'd", + "we'd", + "they'd", + "i'll", + "you'll", + "he'll", + "she'll", + "we'll", + "they'll", + + "isn't", + "aren't", + "wasn't", + "weren't", + "hasn't", + "haven't", + "hadn't", + "doesn't", + "don't", + "didn't", + + "won't", + "wouldn't", + "shan't", + "shouldn't", + "can't", + "cannot", + "couldn't", + "mustn't", + + "let's", + "that's", + "who's", + "what's", + "here's", + "there's", + "when's", + "where's", + "why's", + "how's", + + "an", + "the", + + "and", + "but", + "if", + "or", + "because", + "as", + "until", + "while", + + "of", + "at", + "by", + "for", + "with", + "about", + "against", + "between", + "into", + "through", + "during", + "before", + "after", + "above", + "below", + "to", + "from", + "up", + "down", + "in", + "out", + "on", + "off", + "over", + "under", + + "again", + "further", + "then", + "once", + + "here", + "there", + "when", + "where", + "why", + "how", + + "all", + "any", + "both", + "each", + "few", + "more", + "most", + "other", + "some", + "such", + + "no", + "nor", + "not", + "only", + "own", + "same", + "so", + "than", + "too", + "very", +]; diff --git a/src/tokenizer/stop-words/index.ts b/src/tokenizer/stop-words/index.ts new file mode 100644 index 000000000..54d1464fc --- /dev/null +++ b/src/tokenizer/stop-words/index.ts @@ -0,0 +1,12 @@ +import type { Language } from "../languages"; +import { en } from "./en"; + +type StopWordsMap = { + [key in Language]: string[]; +}; + +export const stopWords: Partial = { + english: en, +}; + +export const availableStopWords = Object.keys(stopWords); diff --git a/tap-snapshots/tests/tokenizer.test.ts.test.cjs b/tap-snapshots/tests/tokenizer.test.ts.test.cjs index d545b3bb6..d8c9318bf 100644 --- a/tap-snapshots/tests/tokenizer.test.ts.test.cjs +++ b/tap-snapshots/tests/tokenizer.test.ts.test.cjs @@ -25,12 +25,11 @@ Array [ exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english > Should tokenize and stem correctly in 
english-O1 1`] = ` Array [ - "the", "quick", "brown", "fox", "jump", - "over", + "", "lazi", "dog", ] @@ -38,28 +37,25 @@ Array [ exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english > Should tokenize and stem correctly in english-O2 1`] = ` Array [ - "i", "bake", - "some", + "", "cake", ] ` exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english and allow duplicates > Should tokenize and stem correctly in english and allow duplicates-O1 1`] = ` Array [ - "thi", - "is", "a", "test", - "with", + "", "duplic", ] ` exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english and allow duplicates > Should tokenize and stem correctly in english and allow duplicates-O2 1`] = ` Array [ - "it'", "aliv", + "", ] ` From 6d709b39de8fe79a0f317c5e6889b0bb95c5f637 Mon Sep 17 00:00:00 2001 From: Michele Riva Date: Mon, 29 Aug 2022 11:32:38 +0200 Subject: [PATCH 3/4] feat(tokenizer): adds ability to disable/enable stemming and stop-words --- src/errors.ts | 12 +++++ src/lyra.ts | 95 ++++++++++++++++++++++++++++++++-- src/tokenizer/index.ts | 35 ++++++++----- src/tokenizer/stemmer/index.ts | 2 +- tests/tokenizer.test.ts | 2 +- 5 files changed, 127 insertions(+), 19 deletions(-) diff --git a/src/errors.ts b/src/errors.ts index c75eda439..a52fd867d 100644 --- a/src/errors.ts +++ b/src/errors.ts @@ -47,3 +47,15 @@ export function GETTER_SETTER_WORKS_ON_EDGE_ONLY(method: string): string { export function RESERVED_PROPERTY_NAME(name: string): string { return `"${name}" is a reserved property name. Please change it to "__${name}", "${name}__", "_${name}_", or similar.`; } + +export function CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY(): string { + return `Custom stop words array must only contain strings.`; +} + +export function CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY(): string { + return `Custom stop words must be a function or an array of strings.`; +} + +export function INVALID_STEMMER_FUNCTION_TYPE(): string { + return `tokenizer.stemmingFn property must be a function.`; +} diff --git a/src/lyra.ts b/src/lyra.ts index 8762720cc..cb10a59c0 100644 --- a/src/lyra.ts +++ b/src/lyra.ts @@ -3,9 +3,11 @@ import { tokenize } from "./tokenizer"; import { getNanosecondsTime, uniqueId, reservedPropertyNames } from "./utils"; import { Language, SUPPORTED_LANGUAGES } from "./tokenizer/languages"; import type { ResolveSchema, SearchProperties } from "./types"; +import { availableStemmers, Stemmer, stemmers } from "./tokenizer/stemmer"; import { create as createNode, Node } from "./prefix-tree/node"; import { find as trieFind, insert as trieInsert, removeDocumentByWord, Nodes } from "./prefix-tree/trie"; import { trackInsertion } from "./insertion-checker"; +import { availableStopWords, stopWords } from "./tokenizer/stop-words"; type Index = Record; @@ -18,6 +20,20 @@ export type PropertiesSchema = { [key: string]: PropertyType | PropertiesSchema; }; +export type TokenizerConfig = { + enableStemming?: boolean; + enableStopWords?: boolean; + customStopWords?: ((stopWords: string[]) => string[]) | string[]; + stemmingFn?: (word: string) => string; +}; + +export type TokenizerConfigExec = { + enableStemming: boolean; + enableStopWords: boolean; + customStopWords: string[]; + stemmingFn?: (word: string) => string; +}; + export type Configuration = { /** * The structure of the document to be inserted into the database. 
@@ -28,6 +44,7 @@ export type Configuration = { */ defaultLanguage?: Language; edge?: boolean; + tokenizer?: TokenizerConfig; }; export type Data = { @@ -41,6 +58,7 @@ export interface Lyra extends Data { defaultLanguage: Language; schema: S; edge: boolean; + tokenizer?: TokenizerConfig; } export type InsertConfig = { @@ -152,12 +170,13 @@ function recursiveTrieInsertion( id: string, config: InsertConfig, prefix = "", + tokenizerConfig: TokenizerConfig, ) { for (const key of Object.keys(doc)) { const isNested = typeof doc[key] === "object"; const propName = `${prefix}${key}`; if (isNested) { - recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema, id, config, propName + "."); + recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema, id, config, propName + ".", tokenizerConfig); return; } @@ -166,7 +185,7 @@ function recursiveTrieInsertion( // Use propName here because if doc is a nested object // We will get the wrong index const requestedTrie = index[propName]; - const tokens = tokenize(doc[key] as string, config.language); + const tokens = tokenize(doc[key] as string, config.language, false, tokenizerConfig); for (const token of tokens) { trieInsert(nodes, requestedTrie, token, id); @@ -247,6 +266,7 @@ export function create(properties: Configuration) nodes: {}, index: {}, edge: properties.edge ?? false, + tokenizer: defaultTokenizerConfig(defaultLanguage, properties.tokenizer!), }; buildIndex(instance, properties.schema); @@ -282,7 +302,7 @@ export function insert( } lyra.docs[id] = doc; - recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config); + recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config, undefined, lyra.tokenizer!); trackInsertion(lyra); return { id }; @@ -358,7 +378,7 @@ export function remove(lyra: Lyra, docID: string) if (propertyType === "string") { const idx = lyra.index[key]; - const tokens = tokenize(document[key] as string); + const tokens = tokenize(document[key] as string, lyra.defaultLanguage, false, lyra.tokenizer!); for (const token of tokens) { if (token && removeDocumentByWord(lyra.nodes, idx, token, docID)) { @@ -394,7 +414,7 @@ export function search( language = lyra.defaultLanguage; } - const tokens = tokenize(params.term, language); + const tokens = tokenize(params.term, language, false, lyra.tokenizer!); const indices = getIndices(lyra, params.properties); const uniqueDocIds = new Set(); const { limit = 10, offset = 0, exact = false } = params; @@ -466,3 +486,68 @@ export function load(lyra: Lyra, { index, docs, n lyra.nodes = nodes; lyra.schema = schema; } + +export function defaultTokenizerConfig(language: Language, tokenizerConfig: TokenizerConfig = {}): TokenizerConfigExec { + let defaultStopWords: string[]; + let defaultStemmingFn: Stemmer | undefined; + + // Enable custom stemming function + if (tokenizerConfig?.stemmingFn) { + if (typeof tokenizerConfig.stemmingFn !== "function") { + defaultStemmingFn = tokenizerConfig?.stemmingFn; + } else { + throw Error(ERRORS.INVALID_STEMMER_FUNCTION_TYPE()); + } + } else { + if (availableStemmers.includes(language)) { + defaultStemmingFn = stemmers[language]!; + } else { + defaultStemmingFn = undefined; + } + } + + // Enable default stop-words + if (availableStopWords.includes(language)) { + defaultStopWords = stopWords[language]!; + } else { + defaultStopWords = []; + } + + // Enable custom stop-words + let customStopWords: string[] | undefined; + + if (tokenizerConfig?.customStopWords) { + switch (typeof tokenizerConfig.customStopWords) { + // Execute the custom step-words 
function.
+      // This will pass the default stop-words for a given language as a first parameter.
+      case "function":
+        customStopWords = tokenizerConfig.customStopWords(defaultStopWords);
+        break;
+
+      // Check if the custom stop-words list is an array.
+      // If it's an object, throw an exception. If the array contains any non-string value, throw an exception.
+      case "object":
+        if (Array.isArray(tokenizerConfig.customStopWords)) {
+          if ((tokenizerConfig.customStopWords as string[]).some((x: unknown) => typeof x !== "string")) {
+            throw Error(ERRORS.CUSTOM_STOP_WORDS_ARRAY_MUST_BE_STRING_ARRAY());
+          } else {
+            customStopWords = tokenizerConfig.customStopWords as string[];
+          }
+        } else {
+          throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
+        }
+        break;
+
+      // By default, throw an exception, as this is a misconfiguration.
+      default:
+        throw Error(ERRORS.CUSTOM_STOP_WORDS_MUST_BE_FUNCTION_OR_ARRAY());
+    }
+  }
+
+  return {
+    enableStopWords: tokenizerConfig?.enableStopWords ?? true,
+    enableStemming: tokenizerConfig?.enableStemming ?? true,
+    stemmingFn: defaultStemmingFn,
+    customStopWords: customStopWords ?? defaultStopWords,
+  };
+}
diff --git a/src/tokenizer/index.ts b/src/tokenizer/index.ts
index f6c0a2014..b23820b6f 100644
--- a/src/tokenizer/index.ts
+++ b/src/tokenizer/index.ts
@@ -1,7 +1,8 @@
-import { Language } from "./languages";
+import type { Language } from "./languages";
+import type { TokenizerConfig } from "../lyra";
+import { defaultTokenizerConfig } from "../lyra";
 import { replaceDiacritics } from "./diacritics";
-import { availableStemmers, stemmers } from "./stemmer";
-import { availableStopWords, stopWords } from "./stop-words";
+import { stemmers } from "./stemmer";
 
 const splitRegex: Record<Language, RegExp> = {
   dutch: /[^a-z0-9_'-]+/gim,
@@ -17,23 +18,28 @@ const splitRegex: Record<Language, RegExp> = {
 
 const normalizationCache = new Map<string, string>();
 
-function normalizeToken(token: string, language: Language): string {
+function normalizeToken(token: string, language: Language, tokenizerConfig: TokenizerConfig): string {
   const key = `${language}-${token}`;
+
   if (normalizationCache.has(key)) {
     return normalizationCache.get(key)!;
   } else {
-    // Remove stop-words
-    if (availableStopWords.includes(language)) {
-      if (stopWords[language]!.includes(token)) {
+    // Check if stop-words removal is enabled
+    if (tokenizerConfig?.enableStopWords) {
+      // Remove stop-words
+      if ((tokenizerConfig?.customStopWords as string[]).includes(token)) {
         const token = "";
         normalizationCache.set(key, token);
         return token;
       }
     }
 
-    // Stem token
-    if (availableStemmers.includes(language)) {
-      token = stemmers[language]!(token);
+    // Check if stemming is enabled
+    if (tokenizerConfig?.enableStemming) {
+      // Stem token when a stemming function is available
+      if (typeof tokenizerConfig?.stemmingFn === "function") {
+        token = stemmers[language]!(token);
+      }
     }
 
     token = replaceDiacritics(token);
@@ -42,7 +48,12 @@ function normalizeToken(token: string, language: Language): string {
   }
 }
 
-export function tokenize(input: string, language: Language = "english", allowDuplicates = false) {
+export function tokenize(
+  input: string,
+  language: Language = "english",
+  allowDuplicates = false,
+  tokenizerConfig: TokenizerConfig = defaultTokenizerConfig(language),
+) {
   /* c8 ignore next 3 */
   if (typeof input !== "string") {
     return [input];
@@ -52,7 +63,7 @@ export function tokenize(input: string, language: Language = "english", allowDup
   const tokens = input
     .toLowerCase()
     .split(splitRule)
-    .map(token => normalizeToken(token, language));
+    .map(token 
=> normalizeToken(token, language, tokenizerConfig!)); const trimTokens = trim(tokens); diff --git a/src/tokenizer/stemmer/index.ts b/src/tokenizer/stemmer/index.ts index 59faf33bd..0beef0b71 100644 --- a/src/tokenizer/stemmer/index.ts +++ b/src/tokenizer/stemmer/index.ts @@ -1,7 +1,7 @@ import type { Language } from "../languages"; import { stem as ENStemmer } from "./en"; -type Stemmer = (word: string) => string; +export type Stemmer = (word: string) => string; type StemmerMap = { [key in Language]: Stemmer; diff --git a/tests/tokenizer.test.ts b/tests/tokenizer.test.ts index 4a2d5ed6e..9e043f119 100644 --- a/tests/tokenizer.test.ts +++ b/tests/tokenizer.test.ts @@ -127,8 +127,8 @@ t.test("Tokenizer", t => { const I1 = "de kleine koeien"; const I2 = "Ik heb wat taarten gemaakt"; - const O1 = tokenize(I1, "dutch"); const O2 = tokenize(I2, "dutch"); + const O1 = tokenize(I1, "dutch"); t.matchSnapshot(O1, `${t.name}-O1`); t.matchSnapshot(O2, `${t.name}-O2`); From 9d0bd1ac88de4457da2e83070d1e8f1777716aaa Mon Sep 17 00:00:00 2001 From: Michele Riva Date: Mon, 29 Aug 2022 12:20:14 +0200 Subject: [PATCH 4/4] test(tokenizer): adds tests for custom tokenizer configs --- src/lyra.ts | 4 +- src/tokenizer/index.ts | 9 +- .../tests/tokenizer.test.ts.test.cjs | 4 - tests/tokenizer.test.ts | 122 +++++++++++++++++- 4 files changed, 128 insertions(+), 11 deletions(-) diff --git a/src/lyra.ts b/src/lyra.ts index cb10a59c0..89e548685 100644 --- a/src/lyra.ts +++ b/src/lyra.ts @@ -493,8 +493,8 @@ export function defaultTokenizerConfig(language: Language, tokenizerConfig: Toke // Enable custom stemming function if (tokenizerConfig?.stemmingFn) { - if (typeof tokenizerConfig.stemmingFn !== "function") { - defaultStemmingFn = tokenizerConfig?.stemmingFn; + if (typeof tokenizerConfig.stemmingFn === "function") { + defaultStemmingFn = tokenizerConfig.stemmingFn; } else { throw Error(ERRORS.INVALID_STEMMER_FUNCTION_TYPE()); } diff --git a/src/tokenizer/index.ts b/src/tokenizer/index.ts index b23820b6f..13d7f26af 100644 --- a/src/tokenizer/index.ts +++ b/src/tokenizer/index.ts @@ -16,10 +16,10 @@ const splitRegex: Record = { swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim, }; -const normalizationCache = new Map(); +export const normalizationCache = new Map(); function normalizeToken(token: string, language: Language, tokenizerConfig: TokenizerConfig): string { - const key = `${language}-${token}`; + const key = `${language}:${token}`; if (normalizationCache.has(key)) { return normalizationCache.get(key)!; @@ -38,7 +38,7 @@ function normalizeToken(token: string, language: Language, tokenizerConfig: Toke if (tokenizerConfig?.enableStemming) { // Stem token when a stemming function is available if (typeof tokenizerConfig?.stemmingFn === "function") { - token = stemmers[language]!(token); + token = tokenizerConfig?.stemmingFn(token); } } @@ -63,7 +63,8 @@ export function tokenize( const tokens = input .toLowerCase() .split(splitRule) - .map(token => normalizeToken(token, language, tokenizerConfig!)); + .map(token => normalizeToken(token, language, tokenizerConfig!)) + .filter(Boolean); const trimTokens = trim(tokens); diff --git a/tap-snapshots/tests/tokenizer.test.ts.test.cjs b/tap-snapshots/tests/tokenizer.test.ts.test.cjs index d8c9318bf..300595ac4 100644 --- a/tap-snapshots/tests/tokenizer.test.ts.test.cjs +++ b/tap-snapshots/tests/tokenizer.test.ts.test.cjs @@ -29,7 +29,6 @@ Array [ "brown", "fox", "jump", - "", "lazi", "dog", ] @@ -38,7 +37,6 @@ Array [ exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem 
correctly in english > Should tokenize and stem correctly in english-O2 1`] = ` Array [ "bake", - "", "cake", ] ` @@ -47,7 +45,6 @@ exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctl Array [ "a", "test", - "", "duplic", ] ` @@ -55,7 +52,6 @@ Array [ exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english and allow duplicates > Should tokenize and stem correctly in english and allow duplicates-O2 1`] = ` Array [ "aliv", - "", ] ` diff --git a/tests/tokenizer.test.ts b/tests/tokenizer.test.ts index 9e043f119..eaf9f26c3 100644 --- a/tests/tokenizer.test.ts +++ b/tests/tokenizer.test.ts @@ -1,5 +1,6 @@ import t from "tap"; -import { tokenize } from "../src/tokenizer"; +import { create } from "../src/lyra"; +import { tokenize, normalizationCache } from "../src/tokenizer"; t.test("Tokenizer", t => { t.plan(10); @@ -134,3 +135,122 @@ t.test("Tokenizer", t => { t.matchSnapshot(O2, `${t.name}-O2`); }); }); + +t.test("Custom stop-words rules", t => { + t.plan(5); + + t.test("custom array of stop-words", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + customStopWords: ["quick", "brown", "fox", "dog"], + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + t.same(O1, ["the", "jump", "over", "lazi"]); + t.same(O2, ["i", "bake", "some", "cake"]); + }); + + t.test("custom stop-words function", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + customStopWords(words: string[]): string[] { + return [...words, "quick", "brown", "fox", "dog"]; + }, + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + t.same(O1, ["jump", "lazi"]); + t.same(O2, ["bake", "cake"]); + }); + + t.test("disable stop-words", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + enableStopWords: false, + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + t.same(O1, ["the", "quick", "brown", "fox", "jump", "over", "lazi", "dog"]); + t.same(O2, ["i", "bake", "some", "cake"]); + }); + + t.test("disable stemming", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + enableStemming: false, + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + t.same(O1, ["quick", "brown", "fox", "jumps", "lazy", "dog"]); + t.same(O2, ["baked", "cakes"]); + }); + + t.test("custom stemming function", t => { + t.plan(2); + + const db = create({ + schema: {}, + tokenizer: { + stemmingFn: word => `${word}-ish`, + }, + }); + + normalizationCache.clear(); + + const I1 = "the quick brown fox jumps over the lazy dog"; + const I2 = "I baked some cakes"; + + const O1 = tokenize(I1, "english", false, db.tokenizer); + const O2 = tokenize(I2, "english", false, db.tokenizer); + + 
t.same(O1, ["quick-ish", "brown-ish", "fox-ish", "jumps-ish", "lazy-ish", "dog-ish"]);
+    t.same(O2, ["baked-ish", "cakes-ish"]);
+  });
+});
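
A minimal usage sketch of what this series enables as a whole, for reviewers; it is illustrative only and not part of the patches. It assumes the `create`/`insert`/`search` surface shown in the diffs above, and the schema, document, and extra stop-word ("foo") are made up for the example:

    import { create, insert, search } from "./src/lyra";

    const db = create({
      schema: { title: "string" },
      defaultLanguage: "english",
      tokenizer: {
        // Stemming and stop-word removal default to true for English,
        // so these two flags only make the defaults explicit.
        enableStemming: true,
        enableStopWords: true,
        // Extend (rather than replace) the built-in stop-words list:
        // the callback receives the default English list as its argument.
        customStopWords: (defaults: string[]) => [...defaults, "foo"],
      },
    });

    insert(db, { title: "I baked some cakes" });

    // "baking" and "baked" both normalize to the stem "bake", so the
    // stemmed search below matches the document: results.count === 1.
    const results = search(db, { term: "baking" });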