Skip to content

Commit

Permalink
feat: add stop words for Hungarian language (#2069)
Browse files: browse the repository at this point in the history
  • Loading branch information
tnxbutno authored Jun 2, 2023
1 parent 3942fc6 commit 4b7c485
Show file tree
Hide file tree
Showing 3 changed files with 205 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/tokenizer/stop_word_filter/gen_stopwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"finnish",
"french",
"german",
"hungarian",
"italian",
"norwegian",
"portuguese",
Expand Down
1 change: 1 addition & 0 deletions src/tokenizer/stop_word_filter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ impl StopWordFilter {
Language::Finnish => stopwords::FINNISH,
Language::French => stopwords::FRENCH,
Language::German => stopwords::GERMAN,
Language::Hungarian => stopwords::HUNGARIAN,
Language::Italian => stopwords::ITALIAN,
Language::Norwegian => stopwords::NORWEGIAN,
Language::Portuguese => stopwords::PORTUGUESE,
Expand Down
204 changes: 203 additions & 1 deletion src/tokenizer/stop_word_filter/stopwords.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
These stop word lists are from the Snowball project (https://snowballstem.org/)
which carries the following license:
which carries the following copyright and license:
Copyright (c) 2001, Dr Martin Porter
Copyright (c) 2004,2005, Richard Boulton
Expand Down Expand Up @@ -862,6 +862,208 @@ pub const GERMAN: &[&str] = &[
"zwischen",
];

/// Stop words for the Hungarian language.
///
/// Selected by `StopWordFilter` for `Language::Hungarian`. Like the other
/// lists in this file, the words come from the Snowball project
/// (https://snowballstem.org/) and keep the ordering used there.
///
/// Each word is listed exactly once: the source list repeated `"majd"`, and
/// the redundant entry has been dropped because a stop word list only matters
/// for membership.
///
/// NOTE(review): this file looks like it is produced by `gen_stopwords.py`;
/// if so, the de-duplication should also be applied in the generator so that
/// a regeneration does not reintroduce the repeated entry — confirm.
pub const HUNGARIAN: &[&str] = &[
    "a",
    "ahogy",
    "ahol",
    "aki",
    "akik",
    "akkor",
    "alatt",
    "által",
    "általában",
    "amely",
    "amelyek",
    "amelyekben",
    "amelyeket",
    "amelyet",
    "amelynek",
    "ami",
    "amit",
    "amolyan",
    "amíg",
    "amikor",
    "át",
    "abban",
    "ahhoz",
    "annak",
    "arra",
    "arról",
    "az",
    "azok",
    "azon",
    "azt",
    "azzal",
    "azért",
    "aztán",
    "azután",
    "azonban",
    "bár",
    "be",
    "belül",
    "benne",
    "cikk",
    "cikkek",
    "cikkeket",
    "csak",
    "de",
    "e",
    "eddig",
    "egész",
    "egy",
    "egyes",
    "egyetlen",
    "egyéb",
    "egyik",
    "egyre",
    "ekkor",
    "el",
    "elég",
    "ellen",
    "elő",
    "először",
    "előtt",
    "első",
    "én",
    "éppen",
    "ebben",
    "ehhez",
    "emilyen",
    "ennek",
    "erre",
    "ez",
    "ezt",
    "ezek",
    "ezen",
    "ezzel",
    "ezért",
    "és",
    "fel",
    "felé",
    "hanem",
    "hiszen",
    "hogy",
    "hogyan",
    "igen",
    "így",
    "illetve",
    "ill.",
    "ill",
    "ilyen",
    "ilyenkor",
    "ison",
    "ismét",
    "itt",
    "jó",
    "jól",
    "jobban",
    "kell",
    "kellett",
    "keresztül",
    "keressünk",
    "ki",
    "kívül",
    "között",
    "közül",
    "legalább",
    "lehet",
    "lehetett",
    "legyen",
    "lenne",
    "lenni",
    "lesz",
    "lett",
    "maga",
    "magát",
    "majd",
    "már",
    "más",
    "másik",
    "meg",
    "még",
    "mellett",
    "mert",
    "mely",
    "melyek",
    "mi",
    "mit",
    "míg",
    "miért",
    "milyen",
    "mikor",
    "minden",
    "mindent",
    "mindenki",
    "mindig",
    "mint",
    "mintha",
    "mivel",
    "most",
    "nagy",
    "nagyobb",
    "nagyon",
    "ne",
    "néha",
    "nekem",
    "neki",
    "nem",
    "néhány",
    "nélkül",
    "nincs",
    "olyan",
    "ott",
    "össze",
    "ő",
    "ők",
    "őket",
    "pedig",
    "persze",
    "rá",
    "s",
    "saját",
    "sem",
    "semmi",
    "sok",
    "sokat",
    "sokkal",
    "számára",
    "szemben",
    "szerint",
    "szinte",
    "talán",
    "tehát",
    "teljes",
    "tovább",
    "továbbá",
    "több",
    "úgy",
    "ugyanis",
    "új",
    "újabb",
    "újra",
    "után",
    "utána",
    "utolsó",
    "vagy",
    "vagyis",
    "valaki",
    "valami",
    "valamint",
    "való",
    "vagyok",
    "van",
    "vannak",
    "volt",
    "voltam",
    "voltak",
    "voltunk",
    "vissza",
    "vele",
    "viszont",
    "volna",
];

pub const ITALIAN: &[&str] = &[
"ad",
"al",
Expand Down

0 comments on commit 4b7c485

Please sign in to comment.