Skip to content

Commit a07efca

Browse files
maximesteinkottmann
authored and committed
OPENNLP-1236: Add arabic and greek stemmers support (#345)
1 parent 92e4456 commit a07efca

File tree

5 files changed

+6010
-1
lines changed

5 files changed

+6010
-1
lines changed

opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballProgram.java

+18
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,11 @@ protected boolean eq_s(int s_size, String s)
190190
return true;
191191
}
192192

193+
protected boolean eq_s(CharSequence s)
194+
{
195+
return eq_s(s.length(), s.toString());
196+
}
197+
193198
protected boolean eq_s_b(int s_size, String s)
194199
{
195200
if (cursor - limit_backward < s_size) return false;
@@ -201,6 +206,11 @@ protected boolean eq_s_b(int s_size, String s)
201206
return true;
202207
}
203208

209+
protected boolean eq_s_b(CharSequence s)
210+
{
211+
return eq_s_b(s.length(), s.toString());
212+
}
213+
204214
protected boolean eq_v(CharSequence s)
205215
{
206216
return eq_s(s.length(), s.toString());
@@ -282,6 +292,10 @@ protected int find_among(Among v[], int v_size)
282292
}
283293
}
284294

295+
protected int find_among(Among v[]){
296+
return find_among(v, v.length);
297+
}
298+
285299
// find_among_b is for backwards processing. Same comments apply
286300
protected int find_among_b(Among v[], int v_size)
287301
{
@@ -351,6 +365,10 @@ protected int find_among_b(Among v[], int v_size)
351365
}
352366
}
353367

368+
protected int find_among_b(Among v[]){
369+
return find_among_b(v, v.length);
370+
}
371+
354372
/* to replace chars between c_bra and c_ket in current by the
355373
* chars in s.
356374
*/

opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java

+9-1
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@
2222
public class SnowballStemmer implements Stemmer {
2323

2424
public enum ALGORITHM {
25+
ARABIC,
2526
DANISH,
2627
DUTCH,
2728
ENGLISH,
2829
FINNISH,
2930
FRENCH,
3031
GERMAN,
32+
GREEK,
3133
HUNGARIAN,
3234
IRISH,
3335
ITALIAN,
@@ -47,7 +49,10 @@ public enum ALGORITHM {
4749
public SnowballStemmer(ALGORITHM algorithm, int repeat) {
4850
this.repeat = repeat;
4951

50-
if (ALGORITHM.DANISH.equals(algorithm)) {
52+
if (ALGORITHM.ARABIC.equals(algorithm)) {
53+
stemmer = new arabicStemmer();
54+
}
55+
else if (ALGORITHM.DANISH.equals(algorithm)) {
5156
stemmer = new danishStemmer();
5257
}
5358
else if (ALGORITHM.DUTCH.equals(algorithm)) {
@@ -65,6 +70,9 @@ else if (ALGORITHM.FRENCH.equals(algorithm)) {
6570
else if (ALGORITHM.GERMAN.equals(algorithm)) {
6671
stemmer = new germanStemmer();
6772
}
73+
else if (ALGORITHM.GREEK.equals(algorithm)) {
74+
stemmer = new greekStemmer();
75+
}
6876
else if (ALGORITHM.HUNGARIAN.equals(algorithm)) {
6977
stemmer = new hungarianStemmer();
7078
}

0 commit comments

Comments
 (0)