Skip to content

Commit

Permalink
OAK-11276: add unit test for hunspell stem + documentation for elastic (
Browse files Browse the repository at this point in the history
#1871)

* OAK-11276: add unit test for hunspell stem + documentation for elastic

* OAK-11276: fix rat check, reduced simplified dictionary

* OAK-11276: simplified affix
  • Loading branch information
fabriziofortino authored Nov 21, 2024
1 parent 2639bdb commit cbe84ba
Show file tree
Hide file tree
Showing 6 changed files with 996 additions and 35 deletions.
1 change: 1 addition & 0 deletions oak-doc/src/site/markdown/query/elastic.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ however there are differences:
* `analyzers` support the Lucene configuration plus Elasticsearch specific [options][options]. Since Elasticsearch uses
a more recent version of Lucene compared to the one in `oak-lucene` module, there might be differences in configuration options
that could require changes when migrating from Lucene to Elasticsearch.
The `HunspellStem` filter is not supported, because it requires the affix and dictionary files to be present on the filesystem of every node in the Elasticsearch cluster.
* `useInExcerpt` does not support regexp relative properties.
* For property definitions, `sync` and `unique` are ignored.
Synchronous indexing, and enforcing uniqueness constraints is not currently supported in elastic indexes.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.jackrabbit.oak.plugins.index.elastic.query.async.ElasticResultRowAsyncIterator;
import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.junit.ClassRule;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.event.Level;

Expand Down Expand Up @@ -253,4 +254,8 @@ public void fulltextSearchWithSnowball() throws Exception {
assertEventually(() -> assertQuery("select * from [nt:base] where CONTAINS(*, 'mangiare')", List.of("/content/bar")));
}

@Test
// Intentionally empty: disables the inherited common test for the Elasticsearch
// implementation. HunspellStem needs its affix/dictionary files on the server's
// filesystem, which cannot be provisioned through the index definition alone.
@Ignore("not supported in elasticsearch since hunspell resources need to be available on the server")
@Override
public void fullTextWithHunspell() {}
}
2 changes: 2 additions & 0 deletions oak-search/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@
<excludes>
<exclude>**/lms-data.tsv</exclude>
<exclude>**/stopwords-snowball.txt</exclude>
<exclude>**/fr-affix.txt</exclude>
<exclude>**/fr-dictionary.txt</exclude>
</excludes>
</configuration>
</plugin>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
*/
package org.apache.jackrabbit.oak.plugins.index;

import org.apache.jackrabbit.JcrConstants;
import org.apache.jackrabbit.oak.api.QueryEngine;
import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.api.Type;
Expand All @@ -38,6 +37,7 @@

import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.ANALYZERS;
import static org.apache.jackrabbit.oak.spi.nodetype.NodeTypeConstants.NT_OAK_UNSTRUCTURED;
import static org.hamcrest.CoreMatchers.containsString;
Expand Down Expand Up @@ -260,10 +260,10 @@ public void fulltextSearchWithCustomComposedFilters() throws Exception {

Tree stopFilter = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "Stop");
stopFilter.setProperty("words", "stop1.txt, stop2.txt");
stopFilter.addChild("stop1.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "foo");
stopFilter.addChild("stop2.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "and");
stopFilter.addChild("stop1.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "foo");
stopFilter.addChild("stop2.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "and");
});

Tree content = root.getTree("/").addChild("content");
Expand All @@ -284,17 +284,17 @@ public void fulltextSearchWithCustomComposedAnalyzer() throws Exception {
addFilter(charFilters, "HTMLStrip");
Tree mappingFilter = addFilter(charFilters, "Mapping");
mappingFilter.setProperty("mapping", "mappings.txt");
mappingFilter.addChild("mappings.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, getHinduArabicMapping());
mappingFilter.addChild("mappings.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, getHinduArabicMapping());

Tree filters = anl.addChild(FulltextIndexConstants.ANL_FILTERS);
addFilter(filters, "LowerCase");
Tree stopFilter = addFilter(filters, "Stop");
stopFilter.setProperty("words", "stop1.txt, stop2.txt");
stopFilter.addChild("stop1.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "my");
stopFilter.addChild("stop2.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "is");
stopFilter.addChild("stop1.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "my");
stopFilter.addChild("stop2.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "is");
addFilter(filters, "PorterStem");
});

Expand Down Expand Up @@ -326,17 +326,17 @@ public void fulltextSearchWithCustomComposedAnalyzerWithComments() throws Except
Tree charFilters = anl.addChild(FulltextIndexConstants.ANL_CHAR_FILTERS);
Tree mappingFilter = addFilter(charFilters, "Mapping");
mappingFilter.setProperty("mapping", "mapping-ISOLatin1Accent.txt");
mappingFilter.addChild("mapping-ISOLatin1Accent.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, mappings);
mappingFilter.addChild("mapping-ISOLatin1Accent.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, mappings);

Tree filters = anl.addChild(FulltextIndexConstants.ANL_FILTERS);
Tree synFilter = addFilter(filters, "Synonym");
synFilter.setProperty("synonyms", "syn.txt");
synFilter.setProperty("format", "solr");
synFilter.setProperty("expand", "true");
synFilter.setProperty("tokenizerFactory", "standard");
synFilter.addChild("syn.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "# Synonym mappings can be used for spelling correction too\n" +
synFilter.addChild("syn.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "# Synonym mappings can be used for spelling correction too\n" +
"tool => instrument");

addFilter(filters, "LowerCase");
Expand All @@ -345,8 +345,8 @@ public void fulltextSearchWithCustomComposedAnalyzerWithComments() throws Except
stopFilter.setProperty("enablePositionIncrements", "true");
stopFilter.setProperty("ignoreCase", "true");
stopFilter.setProperty("words", "stopwords-snowball.txt");
stopFilter.addChild("stopwords-snowball.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, stopwords);
stopFilter.addChild("stopwords-snowball.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, stopwords);
});

Tree content = root.getTree("/").addChild("content");
Expand Down Expand Up @@ -409,8 +409,8 @@ public void fulltextSearchWithProtectedStemmer() throws Exception {
addFilter(filters, "LowerCase");
Tree marker = addFilter(filters, "KeywordMarker");
marker.setProperty("protected", "protected.txt");
marker.addChild("protected.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "# some comment here\nrunning");
marker.addChild("protected.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "# some comment here\nrunning");
addFilter(filters, "PorterStem");
});

Expand Down Expand Up @@ -467,7 +467,7 @@ public void fulltextSearchWithAsciiFolding() throws Exception {

Tree asciiFilter = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "AsciiFolding");
asciiFilter.setProperty("preserveOriginal", "true");
asciiFilter.setProperty(JcrConstants.JCR_PRIMARYTYPE, NT_OAK_UNSTRUCTURED, Type.NAME);
asciiFilter.setProperty(JCR_PRIMARYTYPE, NT_OAK_UNSTRUCTURED, Type.NAME);
});

Tree content = root.getTree("/").addChild("content");
Expand Down Expand Up @@ -511,8 +511,8 @@ public void fulltextSearchWithCommonGrams() throws Exception {

Tree commonGrams = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "CommonGrams");
commonGrams.setProperty("words", "words.txt");
commonGrams.addChild("words.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "is\nthe");
commonGrams.addChild("words.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "is\nthe");

});

Expand Down Expand Up @@ -626,8 +626,8 @@ public void fulltextSearchWithElision() throws Exception {

Tree elision = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "Elision");
elision.setProperty("articles", "articles.txt");
elision.addChild("articles.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "j\ns\nc\nt");
elision.addChild("articles.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "j\ns\nc\nt");
});

Tree content = root.getTree("/").addChild("content");
Expand All @@ -646,8 +646,8 @@ public void fulltextSearchWithKeepWord() throws Exception {

Tree kw = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "KeepWord");
kw.setProperty("words", "words.txt");
kw.addChild("words.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "dog\nelephant\nfox");
kw.addChild("words.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "dog\nelephant\nfox");
});

Tree content = root.getTree("/").addChild("content");
Expand Down Expand Up @@ -769,8 +769,8 @@ public void fulltextSearchWithDictionaryCompounderFilter() throws Exception {
Tree filters = anl.addChild(FulltextIndexConstants.ANL_FILTERS);
Tree dd = addFilter(filters, "DictionaryCompoundWord");
dd.setProperty("dictionary", "words.txt");
dd.addChild("words.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "Donau\ndampf\nmeer\nschiff");
dd.addChild("words.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "Donau\ndampf\nmeer\nschiff");
});

Tree content = root.getTree("/").addChild("content");
Expand Down Expand Up @@ -816,8 +816,8 @@ public void fullTextSearchWithTypeTokenFilter() throws Exception {

Tree type = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "Type");
type.setProperty("types", "stopTypes.txt");
type.addChild("stopTypes.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "<NUM>\n<SYNONYM>");
type.addChild("stopTypes.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "<NUM>\n<SYNONYM>");
});

Tree content = root.getTree("/").addChild("content");
Expand All @@ -839,8 +839,8 @@ public void fullTextSearchWithWhitelistedTypeTokenFilter() throws Exception {
Tree type = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "Type");
type.setProperty("types", "stopTypes.txt");
type.setProperty("useWhitelist", "true");
type.addChild("stopTypes.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "<NUM>\n<SYNONYM>");
type.addChild("stopTypes.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "<NUM>\n<SYNONYM>");
});

Tree content = root.getTree("/").addChild("content");
Expand All @@ -853,15 +853,75 @@ public void fullTextSearchWithWhitelistedTypeTokenFilter() throws Exception {
});
}

/**
 * Verifies dictionary-based stemming via the {@code HunspellStem} filter using a
 * simplified French affix/dictionary pair shipped as test resources: documents
 * containing the inflected forms 'mangé' and 'chevaux' must be found by queries
 * for their stems 'manger' and 'cheval'.
 */
@Test
public void fullTextWithHunspell() throws Exception {
    String affix = readClasspathResource("fr-affix.txt");
    String dictionary = readClasspathResource("fr-dictionary.txt");
    setup(List.of("foo"), idx -> {
        Tree anl = idx.addChild(FulltextIndexConstants.ANALYZERS).addChild(FulltextIndexConstants.ANL_DEFAULT);
        anl.addChild(FulltextIndexConstants.ANL_TOKENIZER).setProperty(FulltextIndexConstants.ANL_NAME, "Standard");
        // The filter references the files by name; their content is stored as
        // child nodes of the filter definition (mimics the other filter tests).
        Tree hunspellStem = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "HunspellStem");
        hunspellStem.setProperty("affix", "fr.aff");
        hunspellStem.addChild("fr.aff").addChild(JCR_CONTENT)
                .setProperty(JCR_DATA, affix);
        hunspellStem.setProperty("dictionary", "fr.dic");
        hunspellStem.addChild("fr.dic").addChild(JCR_CONTENT)
                .setProperty(JCR_DATA, dictionary);
    });

    Tree content = root.getTree("/").addChild("content");
    content.addChild("bar").setProperty("foo", "mangé");
    content.addChild("baz").setProperty("foo", "chevaux");
    root.commit();

    assertEventually(() -> {
        assertQuery("select * from [nt:base] where ISDESCENDANTNODE('/content') and CONTAINS(*, 'manger')", List.of("/content/bar"));
        assertQuery("select * from [nt:base] where ISDESCENDANTNODE('/content') and CONTAINS(*, 'cheval')", List.of("/content/baz"));
    });
}

/**
 * Reads a classpath resource fully as UTF-8 text.
 * Fix over the previous inline version: the stream is closed via
 * try-with-resources, and a missing resource fails with a descriptive
 * message instead of an opaque NullPointerException.
 */
private String readClasspathResource(String name) throws IOException {
    try (var in = getClass().getClassLoader().getResourceAsStream(name)) {
        if (in == null) {
            throw new IllegalStateException("missing test resource: " + name);
        }
        return new String(in.readAllBytes(), StandardCharsets.UTF_8);
    }
}

/**
 * Checks that the algorithmic FrenchLightStemmer reduces the same French words
 * to the same stems as the dictionary-driven Hunspell stemmer exercised in
 * FullTextAnalyzerCommonsTest#fullTextWithHunspell, so either filter yields
 * identical matches for these inputs.
 */
@Test
public void fullTextWithFrenchLightStemmer() throws Exception {
    setup(List.of("foo"), idx -> {
        Tree analyzer = idx.addChild(FulltextIndexConstants.ANALYZERS)
                .addChild(FulltextIndexConstants.ANL_DEFAULT);
        analyzer.addChild(FulltextIndexConstants.ANL_TOKENIZER)
                .setProperty(FulltextIndexConstants.ANL_NAME, "Standard");

        Tree filterChain = analyzer.addChild(FulltextIndexConstants.ANL_FILTERS);
        addFilter(filterChain, "LowerCase");
        // Strip French elided articles (l', d', qu', ...) before stemming.
        Tree elisionFilter = addFilter(filterChain, "Elision");
        elisionFilter.setProperty("articles", "articles.txt");
        elisionFilter.addChild("articles.txt").addChild(JCR_CONTENT)
                .setProperty(JCR_DATA, "l\nm\nt\nqu\nn\ns\nj\nd\nc\njusqu\nquoiqu\nlorsqu\npuisqu");
        addFilter(filterChain, "FrenchLightStem");
    });

    Tree contentRoot = root.getTree("/").addChild("content");
    contentRoot.addChild("bar").setProperty("foo", "mangé");
    contentRoot.addChild("baz").setProperty("foo", "chevaux");
    root.commit();

    // Queries use the stems; the indexed inflected forms must still match.
    assertEventually(() -> {
        assertQuery("select * from [nt:base] where ISDESCENDANTNODE('/content') and CONTAINS(*, 'manger')", List.of("/content/bar"));
        assertQuery("select * from [nt:base] where ISDESCENDANTNODE('/content') and CONTAINS(*, 'cheval')", List.of("/content/baz"));
    });
}

@Test
public void synonyms() throws Exception {
setup(List.of("foo"), idx -> {
Tree anl = idx.addChild(FulltextIndexConstants.ANALYZERS).addChild(FulltextIndexConstants.ANL_DEFAULT);
anl.addChild(FulltextIndexConstants.ANL_TOKENIZER).setProperty(FulltextIndexConstants.ANL_NAME, "Standard");
Tree synFilter = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "Synonym");
synFilter.setProperty("synonyms", "syn.txt");
synFilter.addChild("syn.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "plane, airplane, aircraft\nflies=>scars");
synFilter.addChild("syn.txt").addChild(JCR_CONTENT)
.setProperty(JCR_DATA, "plane, airplane, aircraft\nflies=>scars");
});

Tree content = root.getTree("/").addChild("content");
Expand Down Expand Up @@ -930,7 +990,7 @@ public void wildcardQueryToLookupUnanalyzedText() throws Exception {
protected Tree addFilter(Tree analyzer, String filterName) {
Tree filter = analyzer.addChild(filterName);
// mimics nodes api
filter.setProperty(JcrConstants.JCR_PRIMARYTYPE, NT_OAK_UNSTRUCTURED, Type.NAME);
filter.setProperty(JCR_PRIMARYTYPE, NT_OAK_UNSTRUCTURED, Type.NAME);
try {
filter.setProperty("binary", root.createBlob(new ByteArrayInputStream(new byte[0])), Type.BINARY);
} catch (IOException e) {
Expand Down
Loading

0 comments on commit cbe84ba

Please sign in to comment.