1
1
/*
2
2
* From MihaiValentin/lunr-languages.
3
- * Last update from 04/16/2017 - 19af41fb9bd644d9081ad274f96f700b21464290
3
+ * Last update from 2017/04/16 - 19af41fb9bd644d9081ad274f96f700b21464290
4
4
*/
5
- import { generateTrimmer , generateStopWordFilter , Among , SnowballProgram } from "./support" ;
6
- import { Tokenizer } from "../tokenizer" ;
5
+ import { Tokenizer } from "../../full-text-search/src/index" ;
6
+ import {
7
+ generateTrimmer ,
8
+ generateStopWordFilter ,
9
+ Among ,
10
+ SnowballProgram
11
+ } from "../../full-text-search-language/src/language" ;
7
12
8
- let wordCharacters = "A-Za-z\xAA\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02B8\u02E0-\u02E4\u1D00-\u1D25\u1D2C-\u1D5C\u1D62-\u1D65\u1D6B-\u1D77\u1D79-\u1DBE\u1E00-\u1EFF\u2071\u207F\u2090-\u209C\u212A\u212B\u2132\u214E\u2160-\u2188\u2C60-\u2C7F\uA722-\uA787\uA78B-\uA7AD\uA7B0-\uA7B7\uA7F7-\uA7FF\uAB30-\uAB5A\uAB5C-\uAB64\uFB00-\uFB06\uFF21-\uFF3A\uFF41-\uFF5A" ;
9
- let trimmer = generateTrimmer ( wordCharacters ) ;
13
+ class GermanStemmer {
14
+ public getCurrent : ( ) => string ;
15
+ public setCurrent : ( word : string ) => void ;
16
+ public stem : ( ) => void ;
10
17
11
- let tkz = new Tokenizer ( ) ;
12
-
13
- tkz . add ( "trimmer-de" , trimmer ) ;
14
-
15
- let stemmer = ( ( ( ) => {
16
- /* create the wrapped stemmer object */
17
- let st = new ( function GermanStemmer ( ) {
18
+ constructor ( ) {
19
+ // Write everything in the constructor to reduce code size and increase performance.
20
+ // The original implementation uses an ES5 anonymous function class.
18
21
let a_0 = [ new Among ( "" , - 1 , 6 ) , new Among ( "U" , 0 , 2 ) ,
19
22
new Among ( "Y" , 0 , 1 ) , new Among ( "\u00E4" , 0 , 3 ) ,
20
23
new Among ( "\u00F6" , 0 , 4 ) , new Among ( "\u00FC" , 0 , 5 )
@@ -53,16 +56,17 @@ let stemmer = ((() => {
53
56
117 , 30 , 4
54
57
] ;
55
58
56
- let I_x ;
57
- let I_p2 ;
58
- let I_p1 ;
59
+ let I_x : number ;
60
+ let I_p2 : number ;
61
+ let I_p1 : number ;
59
62
let sbp = new SnowballProgram ( ) ;
60
- this . setCurrent = ( word ) => {
63
+
64
+ this . setCurrent = ( word : string ) => {
61
65
sbp . setCurrent ( word ) ;
62
66
} ;
63
67
this . getCurrent = ( ) => sbp . getCurrent ( ) ;
64
68
65
- function habr1 ( c1 , c2 , v_1 ) {
69
+ function habr1 ( c1 : string , c2 : string , v_1 : number ) {
66
70
if ( sbp . eq_s ( 1 , c1 ) ) {
67
71
sbp . ket = sbp . cursor ;
68
72
if ( sbp . in_grouping ( g_v , 97 , 252 ) ) {
@@ -304,19 +308,12 @@ let stemmer = ((() => {
304
308
r_standard_suffix ( ) ;
305
309
sbp . cursor = sbp . limit_backward ;
306
310
r_postlude ( ) ;
307
- return true ;
308
311
} ;
309
- } ) ;
310
-
311
- /* and return a function that stems a word for the current locale */
312
- return ( token ) => {
313
- st . setCurrent ( token ) ;
314
- st . stem ( ) ;
315
- return st . getCurrent ( ) ;
316
- } ;
317
- } ) ) ( ) ;
312
+ }
313
+ }
318
314
319
- tkz . setSplitter ( "whitespace-splitter" , function defaultSplitter ( str ) {
315
+ // Split at whitespace and dashes.
316
+ function splitter ( str : string ) {
320
317
let trimmedTokens = [ ] ;
321
318
let tokens = str . split ( / [ \s - ] + / ) ;
322
319
for ( let i = 0 ; i < tokens . length ; i ++ ) {
@@ -325,11 +322,22 @@ tkz.setSplitter("whitespace-splitter", function defaultSplitter(str) {
325
322
}
326
323
}
327
324
return trimmedTokens ;
328
- } ) ;
325
+ }
326
+
327
+ const st = new GermanStemmer ( ) ;
329
328
330
- tkz . add ( "stemmer-de" , stemmer ) ;
329
+ function stemmer ( token : string ) { // Stems one token using the shared module-level GermanStemmer instance `st` and returns the resulting string.
330
+ st . setCurrent ( token ) ; // Load the token as the stemmer's current word.
331
+ st . stem ( ) ; // Run the stemmer on the loaded word.
332
+ return st . getCurrent ( ) ; // Read back the stemmer's current word as the result.
333
+ }
331
334
332
- let stopWordFilter = generateStopWordFilter ( [ "aber" , "alle" , "allem" , "allen" , "aller" , "alles" , "als" , "also" , "am" , "an" , "ander" , "andere" , "anderem" , "anderen" , "anderer" , "anderes" , "anderm" , "andern" , "anderr" , "anders" , "auch" , "auf" , "aus" , "bei" , "bin" , "bis" , "bist" , "da" , "damit" , "dann" , "das" , "dasselbe" , "dazu" , "daß" , "dein" , "deine" , "deinem" , "deinen" , "deiner" , "deines" , "dem" , "demselben" , "den" , "denn" , "denselben" , "der" , "derer" , "derselbe" , "derselben" , "des" , "desselben" , "dessen" , "dich" , "die" , "dies" , "diese" , "dieselbe" , "dieselben" , "diesem" , "diesen" , "dieser" , "dieses" , "dir" , "doch" , "dort" , "du" , "durch" , "ein" , "eine" , "einem" , "einen" , "einer" , "eines" , "einig" , "einige" , "einigem" , "einigen" , "einiger" , "einiges" , "einmal" , "er" , "es" , "etwas" , "euch" , "euer" , "eure" , "eurem" , "euren" , "eurer" , "eures" , "für" , "gegen" , "gewesen" , "hab" , "habe" , "haben" , "hat" , "hatte" , "hatten" , "hier" , "hin" , "hinter" , "ich" , "ihm" , "ihn" , "ihnen" , "ihr" , "ihre" , "ihrem" , "ihren" , "ihrer" , "ihres" , "im" , "in" , "indem" , "ins" , "ist" , "jede" , "jedem" , "jeden" , "jeder" , "jedes" , "jene" , "jenem" , "jenen" , "jener" , "jenes" , "jetzt" , "kann" , "kein" , "keine" , "keinem" , "keinen" , "keiner" , "keines" , "können" , "könnte" , "machen" , "man" , "manche" , "manchem" , "manchen" , "mancher" , "manches" , "mein" , "meine" , "meinem" , "meinen" , "meiner" , "meines" , "mich" , "mir" , "mit" , "muss" , "musste" , "nach" , "nicht" , "nichts" , "noch" , "nun" , "nur" , "ob" , "oder" , "ohne" , "sehr" , "sein" , "seine" , "seinem" , "seinen" , "seiner" , "seines" , "selbst" , "sich" , "sie" , "sind" , "so" , "solche" , "solchem" , "solchen" , "solcher" , "solches" , "soll" , "sollte" , "sondern" , "sonst" , "um" , "und" , "uns" , "unse" , "unsem" , "unsen" , "unser" , "unses" , "unter" , "viel" , "vom" , "von" , "vor" , "war" , 
"waren" , "warst" , "was" , "weg" , "weil" , "weiter" , "welche" , "welchem" , "welchen" , "welcher" , "welches" , "wenn" , "werde" , "werden" , "wie" , "wieder" , "will" , "wir" , "wird" , "wirst" , "wo" , "wollen" , "wollte" , "während" , "würde" , "würden" , "zu" , "zum" , "zur" , "zwar" , "zwischen" , "über" ] ) ;
333
- tkz . add ( "stopWordFilter-de" , stopWordFilter ) ;
335
+ const trimmer = generateTrimmer ( "A-Za-z\xAA\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02B8\u02E0-\u02E4\u1D00-\u1D25\u1D2C-\u1D5C\u1D62-\u1D65\u1D6B-\u1D77\u1D79-\u1DBE\u1E00-\u1EFF\u2071\u207F\u2090-\u209C\u212A\u212B\u2132\u214E\u2160-\u2188\u2C60-\u2C7F\uA722-\uA787\uA78B-\uA7AD\uA7B0-\uA7B7\uA7F7-\uA7FF\uAB30-\uAB5A\uAB5C-\uAB64\uFB00-\uFB06\uFF21-\uFF3A\uFF41-\uFF5A" ) ;
336
+ const stopWordFilter = generateStopWordFilter ( [ "aber" , "alle" , "allem" , "allen" , "aller" , "alles" , "als" , "also" , "am" , "an" , "ander" , "andere" , "anderem" , "anderen" , "anderer" , "anderes" , "anderm" , "andern" , "anderr" , "anders" , "auch" , "auf" , "aus" , "bei" , "bin" , "bis" , "bist" , "da" , "damit" , "dann" , "das" , "dasselbe" , "dazu" , "daß" , "dein" , "deine" , "deinem" , "deinen" , "deiner" , "deines" , "dem" , "demselben" , "den" , "denn" , "denselben" , "der" , "derer" , "derselbe" , "derselben" , "des" , "desselben" , "dessen" , "dich" , "die" , "dies" , "diese" , "dieselbe" , "dieselben" , "diesem" , "diesen" , "dieser" , "dieses" , "dir" , "doch" , "dort" , "du" , "durch" , "ein" , "eine" , "einem" , "einen" , "einer" , "eines" , "einig" , "einige" , "einigem" , "einigen" , "einiger" , "einiges" , "einmal" , "er" , "es" , "etwas" , "euch" , "euer" , "eure" , "eurem" , "euren" , "eurer" , "eures" , "für" , "gegen" , "gewesen" , "hab" , "habe" , "haben" , "hat" , "hatte" , "hatten" , "hier" , "hin" , "hinter" , "ich" , "ihm" , "ihn" , "ihnen" , "ihr" , "ihre" , "ihrem" , "ihren" , "ihrer" , "ihres" , "im" , "in" , "indem" , "ins" , "ist" , "jede" , "jedem" , "jeden" , "jeder" , "jedes" , "jene" , "jenem" , "jenen" , "jener" , "jenes" , "jetzt" , "kann" , "kein" , "keine" , "keinem" , "keinen" , "keiner" , "keines" , "können" , "könnte" , "machen" , "man" , "manche" , "manchem" , "manchen" , "mancher" , "manches" , "mein" , "meine" , "meinem" , "meinen" , "meiner" , "meines" , "mich" , "mir" , "mit" , "muss" , "musste" , "nach" , "nicht" , "nichts" , "noch" , "nun" , "nur" , "ob" , "oder" , "ohne" , "sehr" , "sein" , "seine" , "seinem" , "seinen" , "seiner" , "seines" , "selbst" , "sich" , "sie" , "sind" , "so" , "solche" , "solchem" , "solchen" , "solcher" , "solches" , "soll" , "sollte" , "sondern" , "sonst" , "um" , "und" , "uns" , "unse" , "unsem" , "unsen" , "unser" , "unses" , "unter" , "viel" , "vom" , "von" , "vor" , "war" 
, "waren" , "warst" , "was" , "weg" , "weil" , "weiter" , "welche" , "welchem" , "welchen" , "welcher" , "welches" , "wenn" , "werde" , "werden" , "wie" , "wieder" , "will" , "wir" , "wird" , "wirst" , "wo" , "wollen" , "wollte" , "während" , "würde" , "würden" , "zu" , "zum" , "zur" , "zwar" , "zwischen" , "über" ] ) ;
334
337
335
- export { tkz as DE } ;
338
+ // Create, configure and export the German ("de") tokenizer.
339
+ export const DE : Tokenizer = new Tokenizer ( ) ;
340
+ DE . setSplitter ( "whitespace-splitter" , splitter ) ; // Register `splitter` under the label "whitespace-splitter".
341
+ DE . add ( "trimmer-de" , trimmer ) ; // Each add() registers a token function under the given label.
342
+ DE . add ( "stemmer-de" , stemmer ) ; // NOTE(review): registered before the stop-word filter; if functions run in registration order, stop words are matched after stemming - confirm this is intended.
343
+ DE . add ( "stopWordFilter-de" , stopWordFilter ) ;
0 commit comments