Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/reference/analysis/tokenfilters.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ include::tokenfilters/delimited-payload-tokenfilter.asciidoc[]

include::tokenfilters/keep-words-tokenfilter.asciidoc[]

include::tokenfilters/keep-types-tokenfilter.asciidoc[]

include::tokenfilters/classic-tokenfilter.asciidoc[]

include::tokenfilters/apostrophe-tokenfilter.asciidoc[]
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
[[analysis-keep-types-tokenfilter]]
=== Keep Types Token Filter

coming[1.4.0]

A token filter of type `keep_types` that only keeps tokens with a token type
contained in a predefined set.


[float]
=== Options
[horizontal]
types:: a list of types to keep


[float]
=== Settings example

[source,js]
--------------------------------------------------
{
"index" : {
"analysis" : {
"analyzer" : {
"my_analyzer" : {
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "extract_numbers"]
}
},
"filter" : {
"extract_numbers" : {
"type" : "keep_types",
"types" : [ "<NUM>" ]
}
}
}
}
}
--------------------------------------------------
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);
tokenFiltersBindings.processTokenFilter("keep_types", KeepTypesFilterFactory.class);

tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * A {@link TokenFilterFactory} that wraps token streams in a Lucene
 * {@link TypeTokenFilter}. The filter only keeps tokens whose token type is
 * contained in the set configured via the {@value #KEEP_TYPES_KEY} setting.
 * <p/>
 * Configuration options:
 * <p/>
 * <ul>
 * <li>{@value #KEEP_TYPES_KEY} the array of token types to keep (required).</li>
 * </ul>
 */
@AnalysisSettingsRequired
public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
    /** Name of the filter setting that lists the token types to keep. */
    private static final String KEEP_TYPES_KEY = "types";

    /** Token types handed to every {@link TypeTokenFilter} created by this factory. */
    private final Set<String> keepTypes;

    /**
     * Reads the {@value #KEEP_TYPES_KEY} array from the filter settings.
     *
     * @param index         the index this filter belongs to
     * @param indexSettings the index-level settings
     * @param env           the node environment (not read by this constructor)
     * @param name          the configured name of this filter
     * @param settings      the settings of this filter instance
     * @throws ElasticsearchIllegalArgumentException if {@value #KEEP_TYPES_KEY} is not configured
     */
    @Inject
    public KeepTypesFilterFactory(Index index, @IndexSettings Settings indexSettings,
                                  Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);

        // A null default lets us tell "setting absent" apart from "setting present".
        final String[] configuredTypes = settings.getAsArray(KEEP_TYPES_KEY, null);
        if (configuredTypes == null) {
            throw new ElasticsearchIllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured");
        }

        final Set<String> types = new HashSet<>();
        types.addAll(Arrays.asList(configuredTypes));
        this.keepTypes = types;
    }

    /**
     * Wraps the given stream in a {@link TypeTokenFilter} built from the configured types.
     *
     * @param tokenStream the stream to filter
     * @return the filtering stream
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        // The trailing 'true' is what makes the configured types a keep-list
        // (see the TypeTokenFilter constructor documentation).
        final boolean keepListedTypes = true;
        return new TypeTokenFilter(version, tokenStream, keepTypes, keepListedTypes);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ public void testTokenizers() {
put("trim", TrimTokenFilterFactory.class);
put("truncate", TruncateTokenFilterFactory.class);
put("turkishlowercase", LowerCaseTokenFilterFactory.class);
put("type", KeepTypesFilterFactory.class);
put("uppercase", UpperCaseTokenFilterFactory.class);
put("worddelimiter", WordDelimiterTokenFilterFactory.class);

Expand All @@ -168,8 +169,6 @@ public void testTokenizers() {
put("removeduplicates", Void.class);
// ???
put("tokenoffsetpayload", Void.class);
// like a stop filter but by token-type
put("type", Void.class);
// puts the type into the payload
put("typeaspayload", Void.class);
}};
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchTokenStreamTestCase;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;

import static org.hamcrest.Matchers.instanceOf;

public class KeepTypesFilterFactoryTests extends ElasticsearchTokenStreamTestCase {

    /**
     * Registers a {@code keep_types} filter that keeps {@code <NUM>} (plus one
     * type the tokenizer never emits) and checks that, of "Hello 123 world",
     * only the numeric token is asserted to come out of the filter.
     */
    @Test
    public void testKeepTypes() throws IOException {
        final Settings filterSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.keep_numbers.type", "keep_types")
                .putArray("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
                .build();

        final AnalysisService service = AnalysisTestsHelper.createAnalysisServiceFromSettings(filterSettings);
        final TokenFilterFactory keepNumbers = service.tokenFilter("keep_numbers");
        assertThat(keepNumbers, instanceOf(KeepTypesFilterFactory.class));

        final String text = "Hello 123 world";
        final Tokenizer standardTokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(text));

        // Expected position increment of "123" is 2, presumably because the
        // dropped "Hello" token still counts as a position.
        final String[] expectedTerms = {"123"};
        final int[] expectedPositionIncrements = {2};
        assertTokenStreamContents(keepNumbers.create(standardTokenizer), expectedTerms, expectedPositionIncrements);
    }
}