Skip to content

Commit

Permalink
Merge branch 'master' of github.com:medcl/elasticsearch-analysis-ik
Browse files Browse the repository at this point in the history
  • Loading branch information
medcl committed Apr 9, 2018
2 parents 816b8dd + 7028b9e commit 21a859a
Showing 1 changed file with 30 additions and 252 deletions.
282 changes: 30 additions & 252 deletions src/main/java/org/wltea/analyzer/dic/Dictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
package org.wltea.analyzer.dic;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
Expand Down Expand Up @@ -201,6 +200,28 @@ public FileVisitResult visitFileFailed(Path file, IOException e) {
return files;
}

private void loadDictFile(DictSegment dict, Path file, boolean critical, String name) {
try (InputStream is = new FileInputStream(file.toFile())) {
BufferedReader br = new BufferedReader(
new InputStreamReader(is, "UTF-8"), 512);
String word = br.readLine();
if (word != null) {
if (word.startsWith("\uFEFF"))
word = word.substring(1);
for (; word != null; word = br.readLine()) {
word = word.trim();
if (word.isEmpty()) continue;
dict.fillSegment(word.toCharArray());
}
}
} catch (FileNotFoundException e) {
logger.error("ik-analyzer: " + name + " not found", e);
if (critical) throw new RuntimeException("ik-analyzer: " + name + " not found!!!", e);
} catch (IOException e) {
logger.error("ik-analyzer: " + name + " loading failed", e);
}
}

public List<String> getExtDictionarys() {
List<String> extDictFiles = new ArrayList<String>(2);
String extDictCfg = getProperty(EXT_DICT);
Expand Down Expand Up @@ -371,37 +392,7 @@ private void loadMainDict() {

// 读取主词典文件
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);

InputStream is = null;
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger.error(e.getMessage(), e);
}

try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_MainDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);

} catch (IOException e) {
logger.error("ik-analyzer", e);

} finally {
try {
if (is != null) {
is.close();
is = null;
}
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
loadDictFile(_MainDict, file, false, "Main Dict");
// 加载扩展词典
this.loadExtDict();
// 加载远程自定义词库
Expand All @@ -415,44 +406,11 @@ private void loadExtDict() {
// 加载扩展词典配置
List<String> extDictFiles = getExtDictionarys();
if (extDictFiles != null) {
InputStream is = null;
for (String extDictName : extDictFiles) {
// 读取扩展词典文件
logger.info("[Dict Loading] " + extDictName);
Path file = PathUtils.get(extDictName);
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger.error("ik-analyzer", e);
}

// 如果找不到扩展的字典,则忽略
if (is == null) {
continue;
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
// 加载扩展词典数据到主内存词典中
_MainDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);

} catch (IOException e) {
logger.error("ik-analyzer", e);
} finally {
try {
if (is != null) {
is.close();
is = null;
}
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
loadDictFile(_MainDict, file, false, "Extra Dict");
}
}
}
Expand Down Expand Up @@ -533,80 +491,17 @@ private void loadStopWordDict() {

// 读取主词典文件
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);

InputStream is = null;
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger.error(e.getMessage(), e);
}

try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_StopWords.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);

} catch (IOException e) {
logger.error("ik-analyzer", e);

} finally {
try {
if (is != null) {
is.close();
is = null;
}
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
loadDictFile(_StopWords, file, false, "Main Stopwords");

// 加载扩展停止词典
List<String> extStopWordDictFiles = getExtStopWordDictionarys();
if (extStopWordDictFiles != null) {
is = null;
for (String extStopWordDictName : extStopWordDictFiles) {
logger.info("[Dict Loading] " + extStopWordDictName);

// 读取扩展词典文件
file = PathUtils.get(extStopWordDictName);
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger.error("ik-analyzer", e);
}
// 如果找不到扩展的字典,则忽略
if (is == null) {
continue;
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
// 加载扩展停止词典数据到内存中
_StopWords.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);

} catch (IOException e) {
logger.error("ik-analyzer", e);

} finally {
try {
if (is != null) {
is.close();
is = null;
}
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
loadDictFile(_StopWords, file, false, "Extra Stopwords");
}
}

Expand Down Expand Up @@ -639,142 +534,25 @@ private void loadQuantifierDict() {
_QuantifierDict = new DictSegment((char) 0);
// 读取量词词典文件
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger.error("ik-analyzer", e);
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_QuantifierDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);

} catch (IOException ioe) {
logger.error("Quantifier Dictionary loading exception.");

} finally {
try {
if (is != null) {
is.close();
is = null;
}
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
loadDictFile(_QuantifierDict, file, false, "Quantifier");
}

private void loadSurnameDict() {

_SurnameDict = new DictSegment((char) 0);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger.error("ik-analyzer", e);
}
if (is == null) {
throw new RuntimeException("Surname Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_SurnameDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
} catch (IOException e) {
logger.error("ik-analyzer", e);
} finally {
try {
if (is != null) {
is.close();
is = null;
}
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
loadDictFile(_SurnameDict, file, true, "Surname");
}

private void loadSuffixDict() {

_SuffixDict = new DictSegment((char) 0);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger.error("ik-analyzer", e);
}
if (is == null) {
throw new RuntimeException("Suffix Dictionary not found!!!");
}
try {

BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_SuffixDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
} catch (IOException e) {
logger.error("ik-analyzer", e);
} finally {
try {
is.close();
is = null;
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
loadDictFile(_SuffixDict, file, true, "Suffix");
}

private void loadPrepDict() {

_PrepDict = new DictSegment((char) 0);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
logger.error("ik-analyzer", e);
}
if (is == null) {
throw new RuntimeException("Preposition Dictionary not found!!!");
}
try {

BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {

_PrepDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
} catch (IOException e) {
logger.error("ik-analyzer", e);
} finally {
try {
is.close();
is = null;
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
loadDictFile(_PrepDict, file, true, "Preposition");
}

public void reLoadMainDict() {
Expand Down

0 comments on commit 21a859a

Please sign in to comment.