diff --git a/README.md b/README.md
index 6f77f102bfff7..75e410c3f2338 100644
--- a/README.md
+++ b/README.md
@@ -5,19 +5,25 @@ The Phonetic Analysis plugin integrates phonetic token filter analysis with elas
In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-phonetic/1.1.0`.
- ---------------------------------------------
- | Phonetic Analysis Plugin | ElasticSearch |
- ---------------------------------------------
- | master | 0.19 -> master |
- ---------------------------------------------
- | 1.1.0 | 0.19 -> master |
- ---------------------------------------------
- | 1.0.0 | 0.18 |
- ---------------------------------------------
+ -----------------------------------------------
+ | Phonetic Analysis Plugin | ElasticSearch |
+ -----------------------------------------------
+ | master | 0.19.2 -> master |
+ -----------------------------------------------
+ | 1.2.0 | 0.19.2 -> master |
+ -----------------------------------------------
+ | 1.1.0 | 0.19 |
+ -----------------------------------------------
+ | 1.0.0 | 0.18 |
+ -----------------------------------------------
-A `phonetic` token filter that can be configured with different `encoder` types: `metaphone`, `soundex`, `caverphone`, `refined_soundex`, `double_metaphone` (uses "commons codec":http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html).
+A `phonetic` token filter that can be configured with different `encoder` types:
+`metaphone`, `doublemetaphone`, `soundex`, `refinedsoundex`,
+`caverphone1`, `caverphone2`, `cologne`, `nysiis`,
+`koelnerphonetik`, `haasephonetik`
-The `replace` parameter (defaults to `true`) controls if the token processed should be replaced with the encoded one (set it to `true`), or added (set it to `false`).
+The `replace` parameter (defaults to `true`) controls if the token processed
+should be replaced with the encoded one (set it to `true`), or added (set it to `false`).
{
"index" : {
diff --git a/pom.xml b/pom.xml
index 2562c5c71e5d4..6e10c83fcec33 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
4.0.0
org.elasticsearch
elasticsearch-analysis-phonetic
- 1.2.0-SNAPSHOT
+ 1.2.0
jar
Phonetic Analysis for ElasticSearch
2009
@@ -31,7 +31,7 @@
- 0.19.0.RC3
+ 0.19.2
@@ -46,9 +46,9 @@
- commons-codec
- commons-codec
- 1.6
+ org.apache.lucene
+ lucene-analyzers-phonetic
+ 3.6.0
compile
@@ -95,7 +95,7 @@
org.apache.maven.plugins
maven-surefire-plugin
- 2.11
+ 2.12
**/*Tests.java
diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml
index e720308d68563..b15bcd1a39260 100644
--- a/src/main/assemblies/plugin.xml
+++ b/src/main/assemblies/plugin.xml
@@ -19,7 +19,8 @@
true
true
- commons-codec:commons-codec
+ org.apache.lucene:lucene-analyzers-phonetic
+ commons-codec:commons-codec
diff --git a/src/main/java/org/elasticsearch/index/analysis/DoubleMetaphoneFilter.java b/src/main/java/org/elasticsearch/index/analysis/DoubleMetaphoneFilter.java
deleted file mode 100644
index 113e7eebf88d9..0000000000000
--- a/src/main/java/org/elasticsearch/index/analysis/DoubleMetaphoneFilter.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed to ElasticSearch and Shay Banon under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. ElasticSearch licenses this
- * file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.index.analysis;
-
-import org.apache.commons.codec.language.DoubleMetaphone;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-
-import java.io.IOException;
-import java.util.LinkedList;
-
-public final class DoubleMetaphoneFilter extends TokenFilter {
-
- private static final String TOKEN_TYPE = "DoubleMetaphone";
-
- private final LinkedList remainingTokens = new LinkedList();
- private final DoubleMetaphone encoder;
- private final boolean inject;
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
-
- public DoubleMetaphoneFilter(TokenStream input, DoubleMetaphone encoder, boolean inject) {
- super(input);
- this.encoder = encoder;
- this.inject = inject;
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- for (; ; ) {
-
- if (!remainingTokens.isEmpty()) {
- // clearAttributes(); // not currently necessary
- restoreState(remainingTokens.removeFirst());
- return true;
- }
-
- if (!input.incrementToken()) return false;
-
- int len = termAtt.length();
- if (len == 0) return true; // pass through zero length terms
-
- int firstAlternativeIncrement = inject ? 0 : posAtt.getPositionIncrement();
-
- String v = termAtt.toString();
- String primaryPhoneticValue = encoder.doubleMetaphone(v);
- String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
-
- // a flag to lazily save state if needed... this avoids a save/restore when only
- // one token will be generated.
- boolean saveState = inject;
-
- if (primaryPhoneticValue != null && primaryPhoneticValue.length() > 0 && !primaryPhoneticValue.equals(v)) {
- if (saveState) {
- remainingTokens.addLast(captureState());
- }
- posAtt.setPositionIncrement(firstAlternativeIncrement);
- firstAlternativeIncrement = 0;
- termAtt.setEmpty().append(primaryPhoneticValue);
- saveState = true;
- }
-
- if (alternatePhoneticValue != null && alternatePhoneticValue.length() > 0
- && !alternatePhoneticValue.equals(primaryPhoneticValue)
- && !primaryPhoneticValue.equals(v)) {
- if (saveState) {
- remainingTokens.addLast(captureState());
- saveState = false;
- }
- posAtt.setPositionIncrement(firstAlternativeIncrement);
- termAtt.setEmpty().append(alternatePhoneticValue);
- saveState = true;
- }
-
- // Just one token to return, so no need to capture/restore
- // any state, simply return it.
- if (remainingTokens.isEmpty()) {
- return true;
- }
-
- if (saveState) {
- remainingTokens.addLast(captureState());
- }
- }
- }
-
- @Override
- public void reset() throws IOException {
- input.reset();
- remainingTokens.clear();
- }
-}
diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneticFilter.java b/src/main/java/org/elasticsearch/index/analysis/PhoneticFilter.java
deleted file mode 100644
index 1ba1fcdc87d3a..0000000000000
--- a/src/main/java/org/elasticsearch/index/analysis/PhoneticFilter.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to Elastic Search and Shay Banon under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. Elastic Search licenses this
- * file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.index.analysis;
-
-import org.apache.commons.codec.Encoder;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-
-import java.io.IOException;
-
-/**
- * Create tokens for phonetic matches. See:
- * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
- */
-// LUCENE MONITOR - No need for it in Lucene 3.6
-public class PhoneticFilter extends TokenFilter {
-
- protected boolean inject = true;
- protected Encoder encoder = null;
- protected String name = null;
-
- protected State save = null;
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
-
- public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
- super(in);
- this.encoder = encoder;
- this.name = name;
- this.inject = inject;
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (save != null) {
- // clearAttributes(); // not currently necessary
- restoreState(save);
- save = null;
- return true;
- }
-
- if (!input.incrementToken()) return false;
-
- // pass through zero-length terms
- if (termAtt.length() == 0) return true;
-
- String value = termAtt.toString();
- String phonetic = null;
- try {
- String v = encoder.encode(value).toString();
- if (v.length() > 0 && !value.equals(v)) phonetic = v;
- } catch (Exception ignored) {
- } // just use the direct text
-
- if (phonetic == null) return true;
-
- if (!inject) {
- // just modify this token
- termAtt.setEmpty().append(phonetic);
- return true;
- }
-
- // We need to return both the original and the phonetic tokens.
- // to avoid a orig=captureState() change_to_phonetic() saved=captureState() restoreState(orig)
- // we return the phonetic alternative first
-
- int origOffset = posAtt.getPositionIncrement();
- posAtt.setPositionIncrement(0);
- save = captureState();
-
- posAtt.setPositionIncrement(origOffset);
- termAtt.setEmpty().append(phonetic);
- return true;
- }
-
- @Override
- public void reset() throws IOException {
- input.reset();
- save = null;
- }
-}
\ No newline at end of file
diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java
index b033776f65a04..dc491e587ffcd 100644
--- a/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java
@@ -16,87 +16,120 @@
* specific language governing permissions and limitations
* under the License.
*/
-
package org.elasticsearch.index.analysis;
+import java.util.Arrays;
+import java.util.HashSet;
import org.apache.commons.codec.Encoder;
-import org.apache.commons.codec.language.*;
-import org.apache.commons.codec.language.bm.BeiderMorseEncoder;
+import org.apache.commons.codec.language.Caverphone1;
+import org.apache.commons.codec.language.Caverphone2;
+import org.apache.commons.codec.language.ColognePhonetic;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.RefinedSoundex;
+import org.apache.commons.codec.language.Soundex;
+import org.apache.commons.codec.language.bm.Languages.LanguageSet;
import org.apache.commons.codec.language.bm.NameType;
+import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.commons.codec.language.bm.RuleType;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
+import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
+import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
+import org.elasticsearch.index.analysis.phonetic.HaasePhonetik;
+import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
+import org.elasticsearch.index.analysis.phonetic.Nysiis;
import org.elasticsearch.index.settings.IndexSettings;
/**
*
*/
-@AnalysisSettingsRequired
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
private final Encoder encoder;
-
private final boolean replace;
+ private int maxcodelength;
+ private String[] languageset;
+ private NameType nametype;
+ private RuleType ruletype;
@Inject
public PhoneticTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
+ this.languageset = null;
+ this.nametype = null;
+ this.ruletype = null;
+ this.maxcodelength = 0;
this.replace = settings.getAsBoolean("replace", true);
- String encoder = settings.get("encoder");
- if (encoder == null) {
- throw new ElasticSearchIllegalArgumentException("encoder must be set on phonetic token filter");
- }
- if ("metaphone".equalsIgnoreCase(encoder)) {
+ // weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
+ String encodername = settings.get("encoder", "metaphone");
+ if ("metaphone".equalsIgnoreCase(encodername)) {
this.encoder = new Metaphone();
- } else if ("soundex".equalsIgnoreCase(encoder)) {
+ } else if ("soundex".equalsIgnoreCase(encodername)) {
this.encoder = new Soundex();
- } else if ("caverphone1".equalsIgnoreCase(encoder)) {
+ } else if ("caverphone1".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone1();
- } else if ("caverphone2".equalsIgnoreCase(encoder)) {
+ } else if ("caverphone2".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone2();
- } else if ("caverphone".equalsIgnoreCase(encoder)) {
+ } else if ("caverphone".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone2();
- } else if ("refined_soundex".equalsIgnoreCase(encoder) || "refinedSoundex".equalsIgnoreCase(encoder)) {
+ } else if ("refined_soundex".equalsIgnoreCase(encodername) || "refinedSoundex".equalsIgnoreCase(encodername)) {
this.encoder = new RefinedSoundex();
- } else if ("cologne".equalsIgnoreCase(encoder)) {
+ } else if ("cologne".equalsIgnoreCase(encodername)) {
this.encoder = new ColognePhonetic();
- } else if ("double_metaphone".equalsIgnoreCase(encoder) || "doubleMetaphone".equalsIgnoreCase(encoder)) {
- DoubleMetaphone doubleMetaphone = new DoubleMetaphone();
- doubleMetaphone.setMaxCodeLen(settings.getAsInt("max_code_len", doubleMetaphone.getMaxCodeLen()));
- this.encoder = doubleMetaphone;
- } else if ("bm".equalsIgnoreCase(encoder) || "beider_morse".equalsIgnoreCase(encoder)) {
- BeiderMorseEncoder bm = new BeiderMorseEncoder();
+ } else if ("double_metaphone".equalsIgnoreCase(encodername) || "doubleMetaphone".equalsIgnoreCase(encodername)) {
+ this.encoder = null;
+ this.maxcodelength = settings.getAsInt("max_code_len", 4);
+ } else if ("bm".equalsIgnoreCase(encodername) || "beider_morse".equalsIgnoreCase(encodername) || "beidermorse".equalsIgnoreCase(encodername)) {
+ this.encoder = null;
+ this.languageset = settings.getAsArray("languageset");
String ruleType = settings.get("rule_type", "approx");
if ("approx".equalsIgnoreCase(ruleType)) {
- bm.setRuleType(RuleType.APPROX);
+ ruletype = RuleType.APPROX;
} else if ("exact".equalsIgnoreCase(ruleType)) {
- bm.setRuleType(RuleType.EXACT);
+ ruletype = RuleType.EXACT;
} else {
throw new ElasticSearchIllegalArgumentException("No matching rule type [" + ruleType + "] for beider morse encoder");
}
String nameType = settings.get("name_type", "generic");
if ("GENERIC".equalsIgnoreCase(nameType)) {
- bm.setNameType(NameType.GENERIC);
+ nametype = NameType.GENERIC;
} else if ("ASHKENAZI".equalsIgnoreCase(nameType)) {
- bm.setNameType(NameType.ASHKENAZI);
+ nametype = NameType.ASHKENAZI;
} else if ("SEPHARDIC".equalsIgnoreCase(nameType)) {
- bm.setNameType(NameType.SEPHARDIC);
+ nametype = NameType.SEPHARDIC;
}
- this.encoder = bm;
+ } else if ("koelnerphonetik".equalsIgnoreCase(encodername)) {
+ this.encoder = new KoelnerPhonetik();
+ } else if ("haasephonetik".equalsIgnoreCase(encodername)) {
+ this.encoder = new HaasePhonetik();
+ } else if ("nysiis".equalsIgnoreCase(encodername)) {
+ this.encoder = new Nysiis();
} else {
- throw new ElasticSearchIllegalArgumentException("unknown encoder [" + encoder + "] for phonetic token filter");
+ throw new ElasticSearchIllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter");
}
}
@Override
public TokenStream create(TokenStream tokenStream) {
- if (encoder instanceof DoubleMetaphone) {
- return new DoubleMetaphoneFilter(tokenStream, (DoubleMetaphone) encoder, !replace);
+ if (encoder == null) {
+ if (ruletype != null && nametype != null) {
+ if (languageset != null) {
+ final LanguageSet languages = LanguageSet.from(new HashSet(Arrays.asList(languageset)));
+ return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
+ }
+ return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
+ }
+ if (maxcodelength > 0) {
+ return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
+ }
+ } else {
+ return new PhoneticFilter(tokenStream, encoder, !replace);
}
- return new org.elasticsearch.index.analysis.PhoneticFilter(tokenStream, encoder, name(), !replace);
+ throw new ElasticSearchIllegalArgumentException("encoder error");
}
}
\ No newline at end of file
diff --git a/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java b/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java
new file mode 100644
index 0000000000000..7526f205cda2c
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.analysis.phonetic;
+
+/**
+ * Geänderter Algorithmus aus der Matching Toolbox von Rainer Schnell
+ * Java-Programmierung von Jörg Reiher
+ *
+ * Die Kölner Phonetik wurde für den Einsatz in Namensdatenbanken wie
+ * der Verwaltung eines Krankenhauses durch Martin Haase (Institut für
+ * Sprachwissenschaft, Universität zu Köln) und Kai Heitmann (Insitut für
+ * medizinische Statistik, Informatik und Epidemiologie, Köln) überarbeitet.
+ * M. Haase und K. Heitmann. Die Erweiterte Kölner Phonetik. 526, 2000.
+ *
+ * nach: Martin Wilz, Aspekte der Kodierung phonetischer Ähnlichkeiten
+ * in deutschen Eigennamen, Magisterarbeit.
+ * http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
+ *
+ * @author Jörg Prante
+ */
+public class HaasePhonetik extends KoelnerPhonetik {
+
+ private final static String[] HAASE_VARIATIONS_PATTERNS = {"OWN", "RB", "WSK", "A$", "O$", "SCH",
+ "GLI", "EAU$", "^CH", "AUX", "EUX", "ILLE"};
+ private final static String[] HAASE_VARIATIONS_REPLACEMENTS = {"AUN", "RW", "RSK", "AR", "OW", "CH",
+ "LI", "O", "SCH", "O", "O", "I"};
+
+ /**
+ *
+ * @return
+ */
+ @Override
+ protected String[] getPatterns() {
+ return HAASE_VARIATIONS_PATTERNS;
+ }
+
+ /**
+ *
+ * @return
+ */
+ @Override
+ protected String[] getReplacements() {
+ return HAASE_VARIATIONS_REPLACEMENTS;
+ }
+
+ /**
+ *
+ * @return
+ */
+ @Override
+ protected char getCode() {
+ return '9';
+ }
+}
diff --git a/src/main/java/org/elasticsearch/index/analysis/phonetic/KoelnerPhonetik.java b/src/main/java/org/elasticsearch/index/analysis/phonetic/KoelnerPhonetik.java
new file mode 100644
index 0000000000000..3086a5aeda73c
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/analysis/phonetic/KoelnerPhonetik.java
@@ -0,0 +1,327 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.analysis.phonetic;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Kölner Phonetik
+ *
+ * H.J. Postel, Die Kölner Phonetik. Ein Verfahren zu Identifizierung
+ * von Personennamen auf der Grundlage der Gestaltanalyse. IBM-Nachrichten 19 (1969), 925-931
+ *
+ * Algorithmus aus der Matching Toolbox von Rainer Schnell
+ * Java-Programmierung von Jörg Reiher
+ *
+ * mit Änderungen von Jörg Prante
+ *
+ */
+public class KoelnerPhonetik implements StringEncoder {
+
+ private static final String[] POSTEL_VARIATIONS_PATTERNS = {"AUN", "OWN", "RB", "RW", "WSK", "RSK"};
+ private static final String[] POSTEL_VARIATIONS_REPLACEMENTS = {"OWN", "AUN", "RW", "RB", "RSK", "WSK"};
+ private Pattern[] variationsPatterns;
+ private boolean primary = false;
+ private final Set csz = new HashSet(Arrays.asList(
+ 'C', 'S', 'Z'));
+ private final Set ckq = new HashSet(Arrays.asList(
+ 'C', 'K', 'Q'));
+ private final Set aouhkxq = new HashSet(Arrays.asList(
+ 'A', 'O', 'U', 'H', 'K', 'X', 'Q'));
+ private final Set ahkloqrux = new HashSet(Arrays.asList(
+ 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'));
+
+ /**
+ * Constructor for Kölner Phonetik
+ */
+ public KoelnerPhonetik() {
+ init();
+ }
+
+ /**
+ *
+ * @param useOnlyPrimaryCode
+ */
+ public KoelnerPhonetik(boolean useOnlyPrimaryCode) {
+ this();
+ this.primary = useOnlyPrimaryCode;
+ }
+
+ /**
+ * Get variation patterns
+ *
+ * @return string array of variations
+ */
+ protected String[] getPatterns() {
+ return POSTEL_VARIATIONS_PATTERNS;
+ }
+
+ /**
+ *
+ * @return
+ */
+ protected String[] getReplacements() {
+ return POSTEL_VARIATIONS_REPLACEMENTS;
+ }
+
+ /**
+ *
+ * @return
+ */
+ protected char getCode() {
+ return '0';
+ }
+
+ /**
+ *
+ * @param o1
+ * @param o2
+ * @return
+ */
+ public double getRelativeValue(Object o1, Object o2) {
+ String[] kopho1 = code(expandUmlauts(o1.toString().toUpperCase(Locale.GERMANY)));
+ String[] kopho2 = code(expandUmlauts(o2.toString().toUpperCase(Locale.GERMANY)));
+ for (int i = 0; i < kopho1.length; i++) {
+ for (int ii = 0; ii < kopho2.length; ii++) {
+ if (kopho1[i].equals(kopho2[ii])) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+ }
+
+ @Override
+ public Object encode(Object str) throws EncoderException {
+ return encode((String) str);
+ }
+
+ @Override
+ public String encode(String str) throws EncoderException {
+ if (str == null) return null;
+ String[] s = code(str.toString());
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < s.length; i++) {
+ sb.append(s[i]);
+ if (i < s.length - 1) {
+ sb.append('_');
+ }
+ }
+ return sb.toString();
+ }
+
+
+ private void init() {
+ this.variationsPatterns = new Pattern[getPatterns().length];
+ for (int i = 0; i < getPatterns().length; i++) {
+ this.variationsPatterns[i] = Pattern.compile(getPatterns()[i]);
+ }
+ }
+
+ private String[] code(String str) {
+ List parts = partition(str);
+ String[] codes = new String[parts.size()];
+ int i = 0;
+ for (String s : parts) {
+ codes[i++] = substitute(s);
+ }
+ return codes;
+ }
+
+ private List partition(String str) {
+ String primaryForm = str;
+ List parts = new ArrayList();
+ parts.add(primaryForm.replaceAll("[^\\p{L}\\p{N}]", ""));
+ if (!primary) {
+ List tmpParts = new ArrayList();
+ tmpParts.addAll((Arrays.asList(str.split("[\\p{Z}\\p{C}\\p{P}]"))));
+ int numberOfParts = tmpParts.size();
+ while (tmpParts.size() > 0) {
+ StringBuilder part = new StringBuilder();
+ for (int i = 0; i < tmpParts.size(); i++) {
+ part.append(tmpParts.get(i));
+ if (!(i + 1 == numberOfParts)) {
+ parts.add(part.toString());
+ }
+ }
+ tmpParts.remove(0);
+ }
+ }
+ List variations = new ArrayList();
+ for (int i = 0; i < parts.size(); i++) {
+ List variation = getVariations(parts.get(i));
+ if (variation != null) {
+ variations.addAll(variation);
+ }
+ }
+ return variations;
+ }
+
+ private List getVariations(String str) {
+ int position = 0;
+ List variations = new ArrayList();
+ variations.add("");
+ while (position < str.length()) {
+ int i = 0;
+ int substPos = -1;
+ while (substPos < position && i < getPatterns().length) {
+ Matcher m = variationsPatterns[i].matcher(str);
+ while (substPos < position && m.find()) {
+ substPos = m.start();
+ }
+ i++;
+ }
+ if (substPos >= position) {
+ i--;
+ List varNew = new ArrayList();
+ String prevPart = str.substring(position, substPos);
+ for (int ii = 0; ii < variations.size(); ii++) {
+ String tmp = variations.get(ii);
+ varNew.add(tmp.concat(prevPart + getReplacements()[i]));
+ variations.set(ii, variations.get(ii) + prevPart + getPatterns()[i]);
+ }
+ variations.addAll(varNew);
+ position = substPos + getPatterns()[i].length();
+ } else {
+ for (int ii = 0; ii < variations.size(); ii++) {
+ variations.set(ii, variations.get(ii) + str.substring(position, str.length()));
+ }
+ position = str.length();
+ }
+ }
+ return variations;
+ }
+
+ private String substitute(String str) {
+ String s = expandUmlauts(str.toUpperCase(Locale.GERMAN));
+ s = removeSequences(s);
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < s.length(); i++) {
+ char current = s.charAt(i);
+ char next = i + 1 < s.length() ? s.charAt(i + 1) : '_';
+ char prev = i > 0 ? s.charAt(i - 1) : '_';
+ switch (current) {
+ case 'A':
+ case 'E':
+ case 'I':
+ case 'J':
+ case 'Y':
+ case 'O':
+ case 'U':
+ if (i == 0 || ((i == 1) && prev == 'H')) {
+ sb.append(getCode());
+ }
+ break;
+ case 'P':
+ sb.append(next == 'H' ? "33" : '1');
+ break;
+ case 'B':
+ sb.append('1');
+ break;
+ case 'D':
+ case 'T':
+ sb.append(csz.contains(next) ? '8' : '2');
+ break;
+ case 'F':
+ case 'V':
+ case 'W':
+ sb.append('3');
+ break;
+ case 'G':
+ case 'K':
+ case 'Q':
+ sb.append('4');
+ break;
+ case 'C':
+ if (i == 0) {
+ sb.append(ahkloqrux.contains(next) ? '4' : '8');
+ } else {
+ sb.append(aouhkxq.contains(next) ? '4' : '8');
+ }
+ if (sb.length() >= 2 && sb.charAt(sb.length() - 2) == '8') {
+ sb.setCharAt(sb.length() - 1, '8');
+ }
+ break;
+ case 'X':
+ sb.append(i < 1 || !ckq.contains(prev) ? "48" : '8');
+ break;
+ case 'L':
+ sb.append('5');
+ break;
+ case 'M':
+ case 'N':
+ sb.append('6');
+ break;
+ case 'R':
+ sb.append('7');
+ break;
+ case 'S':
+ case 'Z':
+ sb.append('8');
+ break;
+ case 'H':
+ break;
+ }
+ }
+ s = sb.toString();
+ s = removeSequences(s);
+ return s;
+ }
+
+ /**
+ *
+ * @param str
+ * @return
+ */
+ private String expandUmlauts(String str) {
+ return str.replaceAll("\u00C4", "AE").replaceAll("\u00D6", "OE").replaceAll("\u00DC", "UE");
+ }
+
+ /**
+ *
+ * @param str
+ * @return
+ */
+ private String removeSequences(String str) {
+ if (str == null || str.length() == 0) {
+ return "";
+ }
+ int i = 0, j = 0;
+ StringBuilder sb = new StringBuilder().append(str.charAt(i++));
+ char c;
+ while (i < str.length()) {
+ c = str.charAt(i);
+ if (c != sb.charAt(j)) {
+ sb.append(c);
+ j++;
+ }
+ i++;
+ }
+ return sb.toString();
+ }
+}
diff --git a/src/main/java/org/elasticsearch/index/analysis/phonetic/Nysiis.java b/src/main/java/org/elasticsearch/index/analysis/phonetic/Nysiis.java
new file mode 100644
index 0000000000000..6275b84677ace
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/analysis/phonetic/Nysiis.java
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.elasticsearch.index.analysis.phonetic;
+
+import java.util.regex.Pattern;
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ *
+ * Taken from commons-codec trunk (unreleased yet)
+ *
+ * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate
+ * similar names, but can also be used as a general purpose scheme to find word
+ * with similar phonemes.
+ *
+ * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex
+ * algorithm.
+ *
+ * Algorithm description:
+ *
+ * 1. Transcode first characters of name
+ * 1a. MAC -> MCC
+ * 1b. KN -> NN
+ * 1c. K -> C
+ * 1d. PH -> FF
+ * 1e. PF -> FF
+ * 1f. SCH -> SSS
+ * 2. Transcode last characters of name
+ * 2a. EE, IE -> Y
+ * 2b. DT,RT,RD,NT,ND -> D
+ * 3. First character of key = first character of name
+ * 4. Transcode remaining characters by following these rules, incrementing by one character each time
+ * 4a. EV -> AF else A,E,I,O,U -> A
+ * 4b. Q -> G
+ * 4c. Z -> S
+ * 4d. M -> N
+ * 4e. KN -> N else K -> C
+ * 4f. SCH -> SSS
+ * 4g. PH -> FF
+ * 4h. H -> If previous or next is nonvowel, previous
+ * 4i. W -> If previous is vowel, previous
+ * 4j. Add current to key if current != last key character
+ * 5. If last character is S, remove it
+ * 6. If last characters are AY, replace with Y
+ * 7. If last character is A, remove it
+ * 8. Collapse all strings of repeated characters
+ * 9. Add original first character of name as first character of key
+ *
+ *
+ * @see NYSIIS on Wikipedia
+ * @see NYSIIS on dropby.com
+ *
+ */
+public class Nysiis implements StringEncoder {
+
+ private static final char[] CHARS_A = new char[]{'A'};
+ private static final char[] CHARS_AF = new char[]{'A', 'F'};
+ private static final char[] CHARS_C = new char[]{'C'};
+ private static final char[] CHARS_FF = new char[]{'F', 'F'};
+ private static final char[] CHARS_G = new char[]{'G'};
+ private static final char[] CHARS_N = new char[]{'N'};
+ private static final char[] CHARS_NN = new char[]{'N', 'N'};
+ private static final char[] CHARS_S = new char[]{'S'};
+ private static final char[] CHARS_SSS = new char[]{'S', 'S', 'S'};
+ private static final Pattern PAT_MAC = Pattern.compile("^MAC");
+ private static final Pattern PAT_KN = Pattern.compile("^KN");
+ private static final Pattern PAT_K = Pattern.compile("^K");
+ private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
+ private static final Pattern PAT_SCH = Pattern.compile("^SCH");
+ private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
+ private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
+ private static final char SPACE = ' ';
+ private static final int TRUE_LENGTH = 6;
+
+ /**
+ * Tests if the given character is a vowel.
+ *
+ * @param c the character to test
+ * @return {@code true} if the character is a vowel, {@code false} otherwise
+ */
+ private static boolean isVowel(final char c) {
+ return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
+ }
+
+ /**
+ * Transcodes the remaining parts of the String. The method operates on a
+ * sliding window, looking at 4 characters at a time: [i-1, i, i+1, i+2].
+ *
+ * @param prev the previous character
+ * @param curr the current character
+ * @param next the next character
+ * @param aNext the after next character
+ * @return a transcoded array of characters, starting from the current
+ * position
+ */
+ private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
+ // 1. EV -> AF
+ if (curr == 'E' && next == 'V') {
+ return CHARS_AF;
+ }
+
+ // A, E, I, O, U -> A
+ if (isVowel(curr)) {
+ return CHARS_A;
+ }
+
+ // 2. Q -> G, Z -> S, M -> N
+ if (curr == 'Q') {
+ return CHARS_G;
+ } else if (curr == 'Z') {
+ return CHARS_S;
+ } else if (curr == 'M') {
+ return CHARS_N;
+ }
+
+ // 3. KN -> NN else K -> C
+ if (curr == 'K') {
+ if (next == 'N') {
+ return CHARS_NN;
+ } else {
+ return CHARS_C;
+ }
+ }
+
+ // 4. SCH -> SSS
+ if (curr == 'S' && next == 'C' && aNext == 'H') {
+ return CHARS_SSS;
+ }
+
+ // PH -> FF
+ if (curr == 'P' && next == 'H') {
+ return CHARS_FF;
+ }
+
+ // 5. H -> If previous or next is a non vowel, previous.
+ if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
+ return new char[]{prev};
+ }
+
+ // 6. W -> If previous is vowel, previous.
+ if (curr == 'W' && isVowel(prev)) {
+ return new char[]{prev};
+ }
+
+ return new char[]{curr};
+ }
+ /**
+ * Indicates the strict mode.
+ */
+ private final boolean strict;
+
+ /**
+ * Creates an instance of the {@link Nysiis} encoder with strict mode
+ * (original form), i.e. encoded strings have a maximum length of 6.
+ */
+ public Nysiis() {
+ this(true);
+ }
+
+ /**
+ * Create an instance of the {@link Nysiis} encoder with the specified
+ * strict mode:
+ *
+ * - {@code true}: encoded strings have a maximum length of 6
- {@code false}:
+ * encoded strings may have arbitrary length
+ *
+ * @param strict the strict mode
+ */
+ public Nysiis(final boolean strict) {
+ this.strict = strict;
+ }
+
+ /**
+ * Encodes an Object using the NYSIIS algorithm. This method is provided in
+ * order to satisfy the requirements of the Encoder interface, and will
+ * throw an {@link EncoderException} if the supplied object is not of type
+ * {@link String}.
+ *
+ * @param obj Object to encode
+ * @return An object (or a {@link String}) containing the NYSIIS code which
+ * corresponds to the given String.
+ * @throws EncoderException if the parameter supplied is not of a {@link String}
+ * @throws IllegalArgumentException if a character is not mapped
+ */
+ @Override
+ public Object encode(Object obj) throws EncoderException {
+ if (!(obj instanceof String)) {
+ throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
+ }
+ return this.nysiis((String) obj);
+ }
+
+ /**
+ * Encodes a String using the NYSIIS algorithm.
+ *
+ * @param str A String object to encode
+ * @return A Nysiis code corresponding to the String supplied
+ * @throws IllegalArgumentException if a character is not mapped
+ */
+ @Override
+ public String encode(String str) {
+ return this.nysiis(str);
+ }
+
+ /**
+ * Indicates the strict mode for this {@link Nysiis} encoder.
+ *
+ * @return {@code true} if the encoder is configured for strict mode, {@code false}
+ * otherwise
+ */
+ public boolean isStrict() {
+ return this.strict;
+ }
+
+ /**
+ * Retrieves the NYSIIS code for a given String object.
+ *
+ * @param str String to encode using the NYSIIS algorithm
+ * @return A NYSIIS code for the String supplied
+ */
+ public String nysiis(String str) {
+ if (str == null) {
+ return null;
+ }
+
+ // Use the same clean rules as Soundex
+ str = clean(str);
+
+ if (str.length() == 0) {
+ return str;
+ }
+
+ // Translate first characters of name:
+ // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
+ str = PAT_MAC.matcher(str).replaceFirst("MCC");
+ str = PAT_KN.matcher(str).replaceFirst("NN");
+ str = PAT_K.matcher(str).replaceFirst("C");
+ str = PAT_PH_PF.matcher(str).replaceFirst("FF");
+ str = PAT_SCH.matcher(str).replaceFirst("SSS");
+
+ // Translate last characters of name:
+ // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
+ str = PAT_EE_IE.matcher(str).replaceFirst("Y");
+ str = PAT_DT_ETC.matcher(str).replaceFirst("D");
+
+ // First character of key = first character of name.
+ StringBuffer key = new StringBuffer(str.length());
+ key.append(str.charAt(0));
+
+ // Transcode remaining characters, incrementing by one character each time
+ final char[] chars = str.toCharArray();
+ final int len = chars.length;
+
+ for (int i = 1; i < len; i++) {
+ final char next = i < len - 1 ? chars[i + 1] : SPACE;
+ final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
+ final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
+ System.arraycopy(transcoded, 0, chars, i, transcoded.length);
+
+ // only append the current char to the key if it is different from the last one
+ if (chars[i] != chars[i - 1]) {
+ key.append(chars[i]);
+ }
+ }
+
+ if (key.length() > 1) {
+ char lastChar = key.charAt(key.length() - 1);
+
+ // If last character is S, remove it.
+ if (lastChar == 'S') {
+ key.deleteCharAt(key.length() - 1);
+ lastChar = key.charAt(key.length() - 1);
+ }
+
+ if (key.length() > 2) {
+ final char last2Char = key.charAt(key.length() - 2);
+ // If last characters are AY, replace with Y.
+ if (last2Char == 'A' && lastChar == 'Y') {
+ key.deleteCharAt(key.length() - 2);
+ }
+ }
+
+ // If last character is A, remove it.
+ if (lastChar == 'A') {
+ key.deleteCharAt(key.length() - 1);
+ }
+ }
+
+ final String string = key.toString();
+ return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
+ }
+
+ static String clean(String str) {
+ if (str == null || str.length() == 0) {
+ return str;
+ }
+ int len = str.length();
+ char[] chars = new char[len];
+ int count = 0;
+ for (int i = 0; i < len; i++) {
+ if (Character.isLetter(str.charAt(i))) {
+ chars[count++] = str.charAt(i);
+ }
+ }
+ if (count == len) {
+ return str.toUpperCase(java.util.Locale.ENGLISH);
+ }
+ return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
+ }
+}
\ No newline at end of file
diff --git a/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java
index 11f0799781ba3..a5aef2de90cef 100644
--- a/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java
@@ -2,6 +2,8 @@
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
+import static org.elasticsearch.common.settings.ImmutableSettings.*;
+import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
@@ -10,29 +12,38 @@
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
+import org.hamcrest.MatcherAssert;
+import static org.hamcrest.Matchers.*;
import org.testng.annotations.Test;
-import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
-
/**
*/
public class SimplePhoneticAnalysisTests {
@Test
- public void testDefaultsIcuAnalysis() {
+ public void testPhoneticTokenFilterFactory() {
+ Settings settings = settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/phonetic-1.yml").build();
+ AnalysisService analysisService = testSimpleConfiguration(settings);
+ TokenFilterFactory standardfilterFactory = analysisService.tokenFilter("standard");
+ System.err.println("standard filterfactory = " + standardfilterFactory);
+ TokenFilterFactory filterFactory = analysisService.tokenFilter("phonetic");
+ System.err.println("filterfactory = " + filterFactory);
+ MatcherAssert.assertThat(filterFactory, instanceOf(PhoneticTokenFilterFactory.class));
+ }
+
+ private AnalysisService testSimpleConfiguration(Settings settings) {
Index index = new Index("test");
- Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS), new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector();
+ Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
+ new EnvironmentModule(new Environment(settings)),
+ new IndicesAnalysisModule()).createInjector();
Injector injector = new ModulesBuilder().add(
- new IndexSettingsModule(index, EMPTY_SETTINGS),
+ new IndexSettingsModule(index, settings),
new IndexNameModule(index),
- new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new PhoneticAnalysisBinderProcessor()))
- .createChildInjector(parentInjector);
+ new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class))
+ .addProcessor(new PhoneticAnalysisBinderProcessor())).createChildInjector(parentInjector);
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
-
- // need to create one with encoder...
- //TokenFilterFactory tokenFilterFactory = analysisService.tokenFilter("phonetic");
- //MatcherAssert.assertThat(tokenFilterFactory, Matchers.instanceOf(PhoneticTokenFilterFactory.class));
+ return analysisService;
}
}
diff --git a/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml b/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml
new file mode 100644
index 0000000000000..41a4e3fc59fdc
--- /dev/null
+++ b/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml
@@ -0,0 +1,30 @@
+index:
+ analysis:
+ filter:
+ doublemetaphonefilter:
+ type: phonetic
+ encoder: doublemetaphone
+ metaphonefilter:
+ type: phonetic
+ encoder: metaphone
+ soundexfilter:
+ type: phonetic
+ encoder: soundex
+ refinedsoundexfilter:
+ type: phonetic
+ encoder: refinedsoundex
+ caverphonefilter:
+ type: phonetic
+ encoder: caverphone
+ beidermorsefilter:
+ type: phonetic
+ encoder: beidermorse
+ koelnerphonetikfilter:
+ type: phonetic
+ encoder: koelnerphonetik
+ haasephonetikfilter:
+ type: phonetic
+ encoder: haasephonetik
+ nysiisfilter:
+ type: phonetic
+ encoder: nysiis