Skip to content

Commit 3a93477

Browse files
authored
feat(full-text-search): english and german language support
* move language support into its own packages * add German and English language support * optimize language class layout * add unit tests
1 parent 99f5661 commit 3a93477

File tree

14 files changed

+551
-134
lines changed

14 files changed

+551
-134
lines changed

config/build.js

+4-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@ const PACKAGES = [
1313
"local-storage",
1414
"indexed-storage",
1515
"fs-storage",
16-
"full-text-search"
16+
"full-text-search",
17+
"full-text-search-language",
18+
"full-text-search-language-de",
19+
"full-text-search-language-en",
1720
];
1821

1922
const ROOT_DIR = process.cwd();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"name": "@lokijs/full-text-search-language-de",
3+
"description": "A German language analyzer for the full-text-search.",
4+
"author": "Various authors",
5+
"license": "MIT",
6+
"repository": {
7+
"type": "git",
8+
"url": "https://github.com/LokiJS-Forge/LokiJS2.git"
9+
},
10+
"main": "lokijs.full-text-search-language-de.js",
11+
"dependencies": {
12+
"@lokijs/full-text-search": "0",
13+
"@lokijs/full-text-search-language": "0"
14+
}
15+
}
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
import {DE} from "../../../src/language/de";
1+
import {DE} from "../../src/de";
2+
import {createLanguageTest, LanguageTestData} from "../../../full-text-search-language/spec/helper/create_lanuage_test";
23

3-
export const de = {
4+
export const de: LanguageTestData = {
45
tokenizer: DE,
56
docs: [
67
"An Deutschland grenzen neun Nachbarländer und naturräumlich im Norden die Gewässer der Nord- und Ostsee, im Süden das Bergland der Alpen. Es liegt in der gemäßigten Klimazone, zählt mit rund 80 Millionen Einwohnern zu den dicht besiedelten Flächenstaaten und gilt international als das Land mit der dritthöchsten Zahl von Einwanderern. aufeinanderfolgenden. auffassen.",
@@ -9,30 +10,32 @@ export const de = {
910
tests: [{
1011
what: "find the word",
1112
search: "deutschland",
12-
found: [0, 1]
13+
expected: [0, 1]
1314
}, {
1415
what: "find the word",
1516
search: "urlaubsziel",
16-
found: [1]
17+
expected: [1]
1718
}, {
1819
what: "find the word",
1920
search: "gewass",
20-
found: [0]
21+
expected: [0]
2122
}, {
2223
what: "find the word",
2324
search: "verfugt",
24-
found: [1]
25+
expected: [1]
2526
}, {
2627
what: "never find a word that does not exist, like",
2728
search: "inexistent",
28-
found: []
29+
expected: []
2930
}, {
3031
what: "never find a stop word like",
3132
search: "und",
32-
found: []
33+
expected: []
3334
}, {
3435
what: "find a correctly stemmed word",
35-
search: "auffassung",
36-
found: [0]
36+
search: "auffass",
37+
expected: [0]
3738
}]
3839
};
40+
41+
createLanguageTest("de", de);

packages/full-text-search/src/language/de.js packages/full-text-search-language-de/src/de.ts

+41-33
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,23 @@
11
/*
22
* From MihaiValentin/lunr-languages.
3-
* Last update from 04/16/2017 - 19af41fb9bd644d9081ad274f96f700b21464290
3+
* Last update from 2017/04/16 - 19af41fb9bd644d9081ad274f96f700b21464290
44
*/
5-
import {generateTrimmer, generateStopWordFilter, Among, SnowballProgram} from "./support";
6-
import {Tokenizer} from "../tokenizer";
5+
import {Tokenizer} from "../../full-text-search/src/index";
6+
import {
7+
generateTrimmer,
8+
generateStopWordFilter,
9+
Among,
10+
SnowballProgram
11+
} from "../../full-text-search-language/src/language";
712

8-
let wordCharacters = "A-Za-z\xAA\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02B8\u02E0-\u02E4\u1D00-\u1D25\u1D2C-\u1D5C\u1D62-\u1D65\u1D6B-\u1D77\u1D79-\u1DBE\u1E00-\u1EFF\u2071\u207F\u2090-\u209C\u212A\u212B\u2132\u214E\u2160-\u2188\u2C60-\u2C7F\uA722-\uA787\uA78B-\uA7AD\uA7B0-\uA7B7\uA7F7-\uA7FF\uAB30-\uAB5A\uAB5C-\uAB64\uFB00-\uFB06\uFF21-\uFF3A\uFF41-\uFF5A";
9-
let trimmer = generateTrimmer(wordCharacters);
13+
class GermanStemmer {
14+
public getCurrent: () => string;
15+
public setCurrent: (word: string) => void;
16+
public stem: () => void;
1017

11-
let tkz = new Tokenizer();
12-
13-
tkz.add("trimmer-de", trimmer);
14-
15-
let stemmer = ((() => {
16-
/* create the wrapped stemmer object */
17-
let st = new (function GermanStemmer() {
18+
constructor() {
19+
// Write everything in the constructor to reduce code size and increase performance.
20+
// The original implementation uses an ES5 anonymous function class.
1821
let a_0 = [new Among("", -1, 6), new Among("U", 0, 2),
1922
new Among("Y", 0, 1), new Among("\u00E4", 0, 3),
2023
new Among("\u00F6", 0, 4), new Among("\u00FC", 0, 5)
@@ -53,16 +56,17 @@ let stemmer = ((() => {
5356
117, 30, 4
5457
];
5558

56-
let I_x;
57-
let I_p2;
58-
let I_p1;
59+
let I_x: number;
60+
let I_p2: number;
61+
let I_p1: number;
5962
let sbp = new SnowballProgram();
60-
this.setCurrent = (word) => {
63+
64+
this.setCurrent = (word: string) => {
6165
sbp.setCurrent(word);
6266
};
6367
this.getCurrent = () => sbp.getCurrent();
6468

65-
function habr1(c1, c2, v_1) {
69+
function habr1(c1: string, c2: string, v_1: number) {
6670
if (sbp.eq_s(1, c1)) {
6771
sbp.ket = sbp.cursor;
6872
if (sbp.in_grouping(g_v, 97, 252)) {
@@ -304,19 +308,12 @@ let stemmer = ((() => {
304308
r_standard_suffix();
305309
sbp.cursor = sbp.limit_backward;
306310
r_postlude();
307-
return true;
308311
};
309-
});
310-
311-
/* and return a function that stems a word for the current locale */
312-
return (token) => {
313-
st.setCurrent(token);
314-
st.stem();
315-
return st.getCurrent();
316-
};
317-
}))();
312+
}
313+
}
318314

319-
tkz.setSplitter("whitespace-splitter", function defaultSplitter(str) {
315+
// Split at whitespace and dashes.
316+
function splitter(str: string) {
320317
let trimmedTokens = [];
321318
let tokens = str.split(/[\s-]+/);
322319
for (let i = 0; i < tokens.length; i++) {
@@ -325,11 +322,22 @@ tkz.setSplitter("whitespace-splitter", function defaultSplitter(str) {
325322
}
326323
}
327324
return trimmedTokens;
328-
});
325+
}
326+
327+
const st = new GermanStemmer();
329328

330-
tkz.add("stemmer-de", stemmer);
329+
function stemmer(token: string) {
330+
st.setCurrent(token);
331+
st.stem();
332+
return st.getCurrent();
333+
}
331334

332-
let stopWordFilter = generateStopWordFilter(["aber", "alle", "allem", "allen", "aller", "alles", "als", "also", "am", "an", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "damit", "dann", "das", "dasselbe", "dazu", "daß", "dein", "deine", "deinem", "deinen", "deiner", "deines", "dem", "demselben", "den", "denn", "denselben", "der", "derer", "derselbe", "derselben", "des", "desselben", "dessen", "dich", "die", "dies", "diese", "dieselbe", "dieselben", "diesem", "diesen", "dieser", "dieses", "dir", "doch", "dort", "du", "durch", "ein", "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "er", "es", "etwas", "euch", "euer", "eure", "eurem", "euren", "eurer", "eures", "für", "gegen", "gewesen", "hab", "habe", "haben", "hat", "hatte", "hatten", "hier", "hin", "hinter", "ich", "ihm", "ihn", "ihnen", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "in", "indem", "ins", "ist", "jede", "jedem", "jeden", "jeder", "jedes", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "kann", "kein", "keine", "keinem", "keinen", "keiner", "keines", "können", "könnte", "machen", "man", "manche", "manchem", "manchen", "mancher", "manches", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mich", "mir", "mit", "muss", "musste", "nach", "nicht", "nichts", "noch", "nun", "nur", "ob", "oder", "ohne", "sehr", "sein", "seine", "seinem", "seinen", "seiner", "seines", "selbst", "sich", "sie", "sind", "so", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollte", "sondern", "sonst", "um", "und", "uns", "unse", "unsem", "unsen", "unser", "unses", "unter", "viel", "vom", "von", "vor", "war", "waren", "warst", "was", "weg", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "werde", "werden", "wie", "wieder", "will", "wir", "wird", "wirst", "wo", "wollen", 
"wollte", "während", "würde", "würden", "zu", "zum", "zur", "zwar", "zwischen", "über"]);
333-
tkz.add("stopWordFilter-de", stopWordFilter);
335+
const trimmer = generateTrimmer("A-Za-z\xAA\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02B8\u02E0-\u02E4\u1D00-\u1D25\u1D2C-\u1D5C\u1D62-\u1D65\u1D6B-\u1D77\u1D79-\u1DBE\u1E00-\u1EFF\u2071\u207F\u2090-\u209C\u212A\u212B\u2132\u214E\u2160-\u2188\u2C60-\u2C7F\uA722-\uA787\uA78B-\uA7AD\uA7B0-\uA7B7\uA7F7-\uA7FF\uAB30-\uAB5A\uAB5C-\uAB64\uFB00-\uFB06\uFF21-\uFF3A\uFF41-\uFF5A");
336+
const stopWordFilter = generateStopWordFilter(["aber", "alle", "allem", "allen", "aller", "alles", "als", "also", "am", "an", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "damit", "dann", "das", "dasselbe", "dazu", "daß", "dein", "deine", "deinem", "deinen", "deiner", "deines", "dem", "demselben", "den", "denn", "denselben", "der", "derer", "derselbe", "derselben", "des", "desselben", "dessen", "dich", "die", "dies", "diese", "dieselbe", "dieselben", "diesem", "diesen", "dieser", "dieses", "dir", "doch", "dort", "du", "durch", "ein", "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "er", "es", "etwas", "euch", "euer", "eure", "eurem", "euren", "eurer", "eures", "für", "gegen", "gewesen", "hab", "habe", "haben", "hat", "hatte", "hatten", "hier", "hin", "hinter", "ich", "ihm", "ihn", "ihnen", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "in", "indem", "ins", "ist", "jede", "jedem", "jeden", "jeder", "jedes", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "kann", "kein", "keine", "keinem", "keinen", "keiner", "keines", "können", "könnte", "machen", "man", "manche", "manchem", "manchen", "mancher", "manches", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mich", "mir", "mit", "muss", "musste", "nach", "nicht", "nichts", "noch", "nun", "nur", "ob", "oder", "ohne", "sehr", "sein", "seine", "seinem", "seinen", "seiner", "seines", "selbst", "sich", "sie", "sind", "so", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollte", "sondern", "sonst", "um", "und", "uns", "unse", "unsem", "unsen", "unser", "unses", "unter", "viel", "vom", "von", "vor", "war", "waren", "warst", "was", "weg", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "werde", "werden", "wie", "wieder", "will", "wir", "wird", "wirst", "wo", "wollen", 
"wollte", "während", "würde", "würden", "zu", "zum", "zur", "zwar", "zwischen", "über"]);
334337

335-
export {tkz as DE};
338+
// Create, configure and export the tokenizer.
339+
export const DE: Tokenizer = new Tokenizer();
340+
DE.setSplitter("whitespace-splitter", splitter);
341+
DE.add("trimmer-de", trimmer);
342+
DE.add("stemmer-de", stemmer);
343+
DE.add("stopWordFilter-de", stopWordFilter);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
/* global __dirname, module, require */
2+
const path = require("path");
3+
const webpackConigCreator = require('../../config/webpack-config-creator.js');
4+
5+
module.exports = webpackConigCreator({
6+
entry: path.join(__dirname, "src", "de.ts"),
7+
filename: "lokijs.full-text-search-language-de.js",
8+
library: "@lokijs/full-text-search-language-de",
9+
externals: {
10+
"../../full-text-search-language/src/language": "@lokijs/full-text-search-language",
11+
"../../full-text-search/src/index": "@lokijs/full-text-search"
12+
},
13+
});
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"name": "@lokijs/full-text-search-language-en",
3+
"description": "An English language analyzer for the full-text-search.",
4+
"author": "Various authors",
5+
"license": "MIT",
6+
"repository": {
7+
"type": "git",
8+
"url": "https://github.com/LokiJS-Forge/LokiJS2.git"
9+
},
10+
"main": "lokijs.full-text-search-language-en.js",
11+
"dependencies": {
12+
"@lokijs/full-text-search": "0",
13+
"@lokijs/full-text-search-language": "0"
14+
}
15+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import {EN} from "../../src/en";
2+
import {createLanguageTest, LanguageTestData} from "../../../full-text-search-language/spec/helper/create_lanuage_test";
3+
4+
export const en: LanguageTestData = {
5+
tokenizer: EN,
6+
docs: [
7+
"In on announcing if of comparison pianoforte projection. Maids hoped gay yet bed asked blind dried point. On abroad danger likely regret twenty edward do. Too horrible consider followed may differed age.",
8+
"By so delight of showing neither believe he present. Deal sigh up in shew away when. Pursuit considering express no or prepare replied."
9+
],
10+
tests: [{
11+
what: "find the word",
12+
search: "announcing",
13+
expected: [0]
14+
},{
15+
what: "find the word",
16+
search: "believe",
17+
expected: [1]
18+
}, {
19+
what: "find the word",
20+
search: "consider",
21+
expected: [0, 1]
22+
}, {
23+
what: "find the word",
24+
search: "show",
25+
expected: [1]
26+
}, {
27+
what: "never find a word that does not exist, like",
28+
search: "inexistent",
29+
expected: []
30+
}, {
31+
what: "never find a stop word like",
32+
search: "neither",
33+
expected: []
34+
}, {
35+
what: "find a correctly stemmed word",
36+
search: "show",
37+
expected: [1]
38+
}]
39+
};
40+
41+
createLanguageTest("en", en);

0 commit comments

Comments
 (0)