From 33d197d0cded873a68b53f4d0fe6a54c12fdd19b Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 1 Aug 2014 05:27:54 -0400 Subject: [PATCH 1/3] Add keep_types for filtering by token type --- docs/reference/analysis/tokenfilters.asciidoc | 2 + .../keep-types-tokenfilter.asciidoc | 37 ++++++++++ .../index/analysis/AnalysisModule.java | 1 + .../index/analysis/KeepTypeFilterFactory.java | 69 +++++++++++++++++++ .../index/analysis/AnalysisFactoryTests.java | 3 +- .../analysis/KeepTypeFilterFactoryTests.java | 50 ++++++++++++++ 6 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc create mode 100644 src/main/java/org/elasticsearch/index/analysis/KeepTypeFilterFactory.java create mode 100644 src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc index ec46c26de8ec5..ba2ea71c55132 100644 --- a/docs/reference/analysis/tokenfilters.asciidoc +++ b/docs/reference/analysis/tokenfilters.asciidoc @@ -79,6 +79,8 @@ include::tokenfilters/delimited-payload-tokenfilter.asciidoc[] include::tokenfilters/keep-words-tokenfilter.asciidoc[] +include::tokenfilters/keep-types-tokenfilter.asciidoc[] + include::tokenfilters/classic-tokenfilter.asciidoc[] include::tokenfilters/apostrophe-tokenfilter.asciidoc[] diff --git a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc new file mode 100644 index 0000000000000..5947beeb173cd --- /dev/null +++ b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc @@ -0,0 +1,37 @@ +[[analysis-keep-words-tokenfilter]] +=== Keep Types Token Filter + +A token filter of type `keep_types` that only keeps tokens with a token type +contained in a predefined set. 
+ + +[float] +=== Options +[horizontal] +types:: a list of types to keep + + +[float] +=== Settings example + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "standard", + "filter" : ["standard", "lowercase", "extract_numbers"] + } + }, + "filter" : { + "extract_numbers" : { + "type" : "keep_types", + "types" : [ "<NUM>" ] + } + } + } + } +} +-------------------------------------------------- diff --git a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index 2b65c17ac7ded..b86d4b33f5f16 100644 --- a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -485,6 +485,7 @@ public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class); + tokenFiltersBindings.processTokenFilter("keep_types", KeepTypeFilterFactory.class); tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class); diff --git a/src/main/java/org/elasticsearch/index/analysis/KeepTypeFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/KeepTypeFilterFactory.java new file mode 100644 index 0000000000000..4fdf6be0ee4ec --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/KeepTypeFilterFactory.java @@ -0,0 +1,69 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.TypeTokenFilter; +import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/** + * A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only + * keeps tokens whose type is contained in the set configured via the + * {@value #KEEP_TYPES_KEY} setting. + *

+ * Configuration options: + *

+ *

+ */ +@AnalysisSettingsRequired +public class KeepTypeFilterFactory extends AbstractTokenFilterFactory { + private final Set keepTypes; + private static final String KEEP_TYPES_KEY = "types"; + + @Inject + public KeepTypeFilterFactory(Index index, @IndexSettings Settings indexSettings, + Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + + final String[] arrayKeepTypes = settings.getAsArray(KEEP_TYPES_KEY, null); + if ((arrayKeepTypes == null)) { + throw new ElasticsearchIllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured"); + } + + this.keepTypes = new HashSet<>(Arrays.asList(arrayKeepTypes)); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new TypeTokenFilter(version, tokenStream, keepTypes, true); + } +} diff --git a/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java index aff1fd51452c5..2c728e787b93e 100644 --- a/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java @@ -147,6 +147,7 @@ public void testTokenizers() { put("trim", TrimTokenFilterFactory.class); put("truncate", TruncateTokenFilterFactory.class); put("turkishlowercase", LowerCaseTokenFilterFactory.class); + put("type", KeepTypeFilterFactory.class); put("uppercase", UpperCaseTokenFilterFactory.class); put("worddelimiter", WordDelimiterTokenFilterFactory.class); @@ -168,8 +169,6 @@ public void testTokenizers() { put("removeduplicates", Void.class); // ??? 
put("tokenoffsetpayload", Void.class); - // like a stop filter but by token-type - put("type", Void.class); // puts the type into the payload put("typeaspayload", Void.class); }}; diff --git a/src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java new file mode 100644 index 0000000000000..4088408514327 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java @@ -0,0 +1,50 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.test.ElasticsearchTokenStreamTestCase; +import org.junit.Test; + +import java.io.IOException; +import java.io.StringReader; + +import static org.hamcrest.Matchers.instanceOf; + +public class KeepTypeFilterFactoryTests extends ElasticsearchTokenStreamTestCase { + + @Test + public void testKeepTypes() throws IOException { + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.keep_numbers.type", "keep_types") + .putArray("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"}) + .build(); + AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings); + TokenFilterFactory tokenFilter = analysisService.tokenFilter("keep_numbers"); + assertThat(tokenFilter, instanceOf(KeepTypeFilterFactory.class)); + String source = "Hello 123 world"; + String[] expected = new String[]{"123"}; + Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2}); + } +} From 936f7d52816a1c3585aa528a07e07a2731b6c357 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 15 Aug 2014 07:43:36 -0400 Subject: [PATCH 2/3] address comments in PR --- .../analysis/tokenfilters/keep-types-tokenfilter.asciidoc | 4 +++- .../java/org/elasticsearch/index/analysis/AnalysisModule.java | 2 +- ...KeepTypeFilterFactory.java => KeepTypesFilterFactory.java} | 4 ++-- .../elasticsearch/index/analysis/AnalysisFactoryTests.java | 2 +- .../index/analysis/KeepTypeFilterFactoryTests.java | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) rename src/main/java/org/elasticsearch/index/analysis/{KeepTypeFilterFactory.java => 
KeepTypesFilterFactory.java} (93%) diff --git a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc index 5947beeb173cd..8e504e4730635 100644 --- a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc @@ -1,6 +1,8 @@ -[[analysis-keep-words-tokenfilter]] +[[analysis-keep-types-tokenfilter]] === Keep Types Token Filter +coming[1.4.0] + A token filter of type `keep_types` that only keeps tokens with a token type contained in a predefined set. diff --git a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index b86d4b33f5f16..a4904330142c3 100644 --- a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -485,7 +485,7 @@ public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class); - tokenFiltersBindings.processTokenFilter("keep_types", KeepTypeFilterFactory.class); + tokenFiltersBindings.processTokenFilter("keep_types", KeepTypesFilterFactory.class); tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class); diff --git a/src/main/java/org/elasticsearch/index/analysis/KeepTypeFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java similarity index 93% rename from src/main/java/org/elasticsearch/index/analysis/KeepTypeFilterFactory.java rename to 
src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java index 4fdf6be0ee4ec..5c69a2b03f458 100644 --- a/src/main/java/org/elasticsearch/index/analysis/KeepTypeFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java @@ -45,12 +45,12 @@ * */ @AnalysisSettingsRequired -public class KeepTypeFilterFactory extends AbstractTokenFilterFactory { +public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { private final Set keepTypes; private static final String KEEP_TYPES_KEY = "types"; @Inject - public KeepTypeFilterFactory(Index index, @IndexSettings Settings indexSettings, + public KeepTypesFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); diff --git a/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java index 2c728e787b93e..895ac2c9211e4 100644 --- a/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java @@ -147,7 +147,7 @@ public void testTokenizers() { put("trim", TrimTokenFilterFactory.class); put("truncate", TruncateTokenFilterFactory.class); put("turkishlowercase", LowerCaseTokenFilterFactory.class); - put("type", KeepTypeFilterFactory.class); + put("type", KeepTypesFilterFactory.class); put("uppercase", UpperCaseTokenFilterFactory.class); put("worddelimiter", WordDelimiterTokenFilterFactory.class); diff --git a/src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java index 4088408514327..3b754dbbbed3b 100644 --- a/src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java +++ 
b/src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java @@ -41,7 +41,7 @@ public void testKeepTypes() throws IOException { .build(); AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings); TokenFilterFactory tokenFilter = analysisService.tokenFilter("keep_numbers"); - assertThat(tokenFilter, instanceOf(KeepTypeFilterFactory.class)); + assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class)); String source = "Hello 123 world"; String[] expected = new String[]{"123"}; Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(source)); From c86a07668607977ed215ccac055a8edf9a8e0459 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 15 Aug 2014 09:25:41 -0400 Subject: [PATCH 3/3] rename test for consistency --- ...FilterFactoryTests.java => KeepTypesFilterFactoryTests.java} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename src/test/java/org/elasticsearch/index/analysis/{KeepTypeFilterFactoryTests.java => KeepTypesFilterFactoryTests.java} (96%) diff --git a/src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java similarity index 96% rename from src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java rename to src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java index 3b754dbbbed3b..425784d64daf5 100644 --- a/src/test/java/org/elasticsearch/index/analysis/KeepTypeFilterFactoryTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java @@ -31,7 +31,7 @@ import static org.hamcrest.Matchers.instanceOf; -public class KeepTypeFilterFactoryTests extends ElasticsearchTokenStreamTestCase { +public class KeepTypesFilterFactoryTests extends ElasticsearchTokenStreamTestCase { @Test public void testKeepTypes() throws IOException {