Skip to content

Commit

Permalink
Expose DelimitedTermFrequencyTokenFilter (opensearch-project#9479)
Browse files Browse the repository at this point in the history
* Expose DelimitedTermFrequencyTokenFilter

Relates: opensearch-project#9413

This commit exposes Lucene's delimited term frequency token filter to be
able to provide term frequencies along with terms.

Signed-off-by: Russ Cam <russcam@canva.com>

* fix format violations

Signed-off-by: Russ Cam <russcam@canva.com>

* fix test and add to changelog

Signed-off-by: Russ Cam <russcam@canva.com>

* Address PR feedback

- Add unit tests for DelimitedTermFrequencyTokenFilterFactory
- Remove IllegalArgumentException as caught exception
- Add skip to yaml rest tests to skip for version < 2.10

Signed-off-by: Russ Cam <russcam@canva.com>

* formatting

Signed-off-by: Russ Cam <russcam@canva.com>

* Rename filter

Signed-off-by: Russ Cam <russcam@canva.com>

* update naming in REST tests

Signed-off-by: Russ Cam <russcam@canva.com>

---------

Signed-off-by: Russ Cam <russcam@canva.com>
Signed-off-by: Shivansh Arora <hishiv@amazon.com>
  • Loading branch information
russcam authored and shiv0408 committed Apr 25, 2024
1 parent 25a2624 commit 7575ba5
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- [BWC and API enforcement] Decorate the existing APIs with proper annotations (part 1) ([#9520](https://github.com/opensearch-project/OpenSearch/pull/9520))
- Add concurrent segment search related metrics to node and index stats ([#9622](https://github.com/opensearch-project/OpenSearch/issues/9622))
- Decouple replication lag from logic to fail stale replicas ([#9507](https://github.com/opensearch-project/OpenSearch/pull/9507))
- Expose DelimitedTermFrequencyTokenFilter to allow providing term frequencies along with terms ([#9479](https://github.com/opensearch-project/OpenSearch/pull/9479))

### Dependencies
- Bump `org.apache.logging.log4j:log4j-core` from 2.17.1 to 2.20.0 ([#8307](https://github.com/opensearch-project/OpenSearch/pull/8307))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
Expand Down Expand Up @@ -265,6 +266,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
);
filters.put("decimal_digit", DecimalDigitFilterFactory::new);
filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new);
filters.put("delimited_term_freq", DelimitedTermFrequencyTokenFilterFactory::new);
filters.put("dictionary_decompounder", requiresAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
filters.put("dutch_stem", DutchStemTokenFilterFactory::new);
filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
Expand Down Expand Up @@ -500,6 +502,13 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
)
)
);
filters.add(
PreConfiguredTokenFilter.singleton(
"delimited_term_freq",
false,
input -> new DelimitedTermFrequencyTokenFilter(input, DelimitedTermFrequencyTokenFilterFactory.DEFAULT_DELIMITER)
)
);
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input -> new EdgeNGramTokenFilter(input, 1)));
filters.add(PreConfiguredTokenFilter.openSearchVersion("edgeNGram", false, false, (reader, version) -> {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;

public class DelimitedTermFrequencyTokenFilterFactory extends AbstractTokenFilterFactory {
public static final char DEFAULT_DELIMITER = '|';
private static final String DELIMITER = "delimiter";
private final char delimiter;

DelimitedTermFrequencyTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
delimiter = parseDelimiter(settings);
}

@Override
public TokenStream create(TokenStream tokenStream) {
return new DelimitedTermFrequencyTokenFilter(tokenStream, delimiter);
}

private static char parseDelimiter(Settings settings) {
String delimiter = settings.get(DELIMITER);
if (delimiter == null) {
return DEFAULT_DELIMITER;
} else if (delimiter.length() == 1) {
return delimiter.charAt(0);
}

throw new IllegalArgumentException(
"Setting [" + DELIMITER + "] must be a single, non-null character. [" + delimiter + "] was provided."
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("cjkwidth", CJKWidthFilterFactory.class);
filters.put("cjkbigram", CJKBigramFilterFactory.class);
filters.put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class);
filters.put("delimitedtermfrequency", DelimitedTermFrequencyTokenFilterFactory.class);
filters.put("keepword", KeepWordFilterFactory.class);
filters.put("type", KeepTypesFilterFactory.class);
filters.put("classic", ClassicFilterFactory.class);
Expand Down Expand Up @@ -202,6 +203,7 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
filters.put("decimal_digit", null);
filters.put("delimited_payload_filter", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class);
filters.put("delimited_payload", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class);
filters.put("delimited_term_freq", org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory.class);
filters.put("dutch_stem", SnowballPorterFilterFactory.class);
filters.put("edge_ngram", null);
filters.put("edgeNGram", null);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.analysis.AnalysisTestsHelper;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.test.OpenSearchTestCase;
import org.opensearch.test.OpenSearchTokenStreamTestCase;

import java.io.StringReader;

public class DelimitedTermFrequencyTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase {

public void testDefault() throws Exception {
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_delimited_term_freq.type", "delimited_term_freq")
.build(),
new CommonAnalysisModulePlugin()
);
doTest(analysis, "cat|4 dog|5");
}

public void testDelimiter() throws Exception {
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_delimited_term_freq.type", "delimited_term_freq")
.put("index.analysis.filter.my_delimited_term_freq.delimiter", ":")
.build(),
new CommonAnalysisModulePlugin()
);
doTest(analysis, "cat:4 dog:5");
}

public void testDelimiterLongerThanOneCharThrows() {
IllegalArgumentException ex = expectThrows(
IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_delimited_term_freq.type", "delimited_term_freq")
.put("index.analysis.filter.my_delimited_term_freq.delimiter", "^^")
.build(),
new CommonAnalysisModulePlugin()
)
);

assertEquals("Setting [delimiter] must be a single, non-null character. [^^] was provided.", ex.getMessage());
}

private void doTest(OpenSearchTestCase.TestAnalysis analysis, String source) throws Exception {
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_delimited_term_freq");
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));

TokenStream stream = tokenFilter.create(tokenizer);

CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
TermFrequencyAttribute tfAtt = stream.getAttribute(TermFrequencyAttribute.class);
stream.reset();
assertTermEquals("cat", stream, termAtt, tfAtt, 4);
assertTermEquals("dog", stream, termAtt, tfAtt, 5);
assertFalse(stream.incrementToken());
stream.end();
stream.close();
}

void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf)
throws Exception {
assertTrue(stream.incrementToken());
assertEquals(expected, termAtt.toString());
assertEquals(expectedTf, tfAtt.getTermFrequency());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1198,6 +1198,46 @@
- match: { tokens.0.token: foo }

---
"delimited_term_freq":
- skip:
version: " - 2.9.99"
reason: "delimited_term_freq token filter was added in v2.10.0"
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_delimited_term_freq:
type: delimited_term_freq
delimiter: ^
- do:
indices.analyze:
index: test
body:
text: foo^3
tokenizer: keyword
filter: [my_delimited_term_freq]
attributes: termFrequency
explain: true
- length: { detail.tokenfilters: 1 }
- match: { detail.tokenfilters.0.tokens.0.token: foo }
- match: { detail.tokenfilters.0.tokens.0.termFrequency: 3 }

# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: foo|100
tokenizer: keyword
filter: [delimited_term_freq]
attributes: termFrequency
explain: true
- length: { detail.tokenfilters: 1 }
- match: { detail.tokenfilters.0.tokens.0.token: foo }
- match: { detail.tokenfilters.0.tokens.0.termFrequency: 100 }
---
"keep_filter":
- do:
indices.create:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase {
.put("czechstem", MovedToAnalysisCommon.class)
.put("decimaldigit", MovedToAnalysisCommon.class)
.put("delimitedpayload", MovedToAnalysisCommon.class)
.put("delimitedtermfrequency", MovedToAnalysisCommon.class)
.put("dictionarycompoundword", MovedToAnalysisCommon.class)
.put("edgengram", MovedToAnalysisCommon.class)
.put("elision", MovedToAnalysisCommon.class)
Expand Down Expand Up @@ -201,9 +202,6 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase {
.put("daterecognizer", Void.class)
// for token filters that generate bad offsets, which are now rejected since Lucene 7
.put("fixbrokenoffsets", Void.class)
// should we expose it, or maybe think about higher level integration of the
// fake term frequency feature (LUCENE-7854)
.put("delimitedtermfrequency", Void.class)
// LUCENE-8273: ProtectedTermFilterFactory allows analysis chains to skip
// particular token filters based on the attributes of the current token.
.put("protectedterm", Void.class)
Expand Down

0 comments on commit 7575ba5

Please sign in to comment.