Skip to content

Commit

Permalink
1) Added more shorttext languages from Shuyo's language-detection pro…
Browse files Browse the repository at this point in the history
…ject

2) Added TechnicalLanguageDetectorImplTest.testDetector5() to test N-gram for the Malay language.
  • Loading branch information
eclectice committed Oct 5, 2016
1 parent c5fcb81 commit 467e52f
Show file tree
Hide file tree
Showing 36 changed files with 106 additions and 29 deletions.
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ apply plugin: 'java'

tasks.withType(JavaCompile) {
options.encoding = 'UTF-8'
options.compilerArgs << "-Xlint:-deprecation" << "-Xlint:unchecked"
}

// In this section you declare where to find the dependencies of your project
Expand Down
46 changes: 21 additions & 25 deletions gradlew
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,12 @@
##
##############################################################################

# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS=""

APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`

# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS=""

# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"

Expand All @@ -48,7 +30,6 @@ die ( ) {
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
Expand All @@ -59,11 +40,26 @@ case "`uname`" in
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac

# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null

CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar

# Determine the Java command to use to start the JVM.
Expand All @@ -89,7 +85,7 @@ location of your Java installation."
fi

# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
Expand Down
8 changes: 4 additions & 4 deletions gradlew.bat
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=

set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

Expand Down Expand Up @@ -46,7 +46,7 @@ echo location of your Java installation.
goto fail

:init
@rem Get command-line arguments, handling Windows variants
@rem Get command-line arguments, handling Windows variants

if not "%OS%" == "Windows_NT" goto win9xME_args
if "%@eval[2+2]" == "4" goto 4NT_args
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,23 +94,55 @@ public class BuiltInLanguages {

static {
    // Language codes registered as short-text languages, listed in
    // alphabetical order. Presumably each code corresponds to a profile
    // resource of the same name under languages.shorttext/ -- confirm
    // against the resource loader before adding or removing an entry.
    shortTextLanguages = ImmutableList.of(
            "ar", "bg", "bn", "ca", "cs", "da", "de", "el",
            "en", "es", "et", "fa", "fi", "fr", "gu", "he",
            "hi", "hr", "hu", "id", "it", "ja", "ko", "lt",
            "lv", "mk", "ml", "nl", "no", "pa", "pl", "pt",
            "ro", "ru", "si", "sq", "sv", "ta", "te", "th",
            "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,10 @@ NGram.KANJI_7_32=\u4E11\u4F3A\u4F51\u5197\u51B6\u51F9\u52FF\u541F\u5507\u5589\u5
NGram.KANJI_7_33=\u4E4B\u4E86\u4E94\u4EA4\u4EAC\u4ECA\u4ED6\u4EF6\u4EFB\u4F9B\u4FDD\u4FE1\u5143\u5148\u5149\u518D\u5217\u521D\u5305\u5341\u534A\u53C8\u53CD\u53D6\u53D7\u53E3\u53E4\u53EF\u53F2\u53F8\u5404\u5411\u5468\u547D\u54C1\u5546\u5668\u56DB\u56DE\u56E0\u571F\u578B\u57CE\u57DF\u5883\u58EB\u592A\u592E\u5973\u59CB\u59D4\u5B57\u5B58\u5B89\u5B98\u5C11\u5C31\u5C40\u5C55\u5DDD\u5E03\u5E38\u5E9C\u5F15\u5F62\u5F71\u5F97\u5FC3\u60C5\u610F\u624B\u6280\u6301\u63A5\u63A8\u63D0\u652F\u6539\u653E\u6559\u65BD\u65CF\u661F\u66F2\u671D\u672A\u6797\u679C\u6821\u683C\u6B7B\u6BD4\u6C34\u6C5F\u6CB3\u6D3B\u6D41\u6E2F\u6E90\u6F14\u7136\u7248\u738B\u7403\u76F4\u7701\u77E5\u77F3\u7814\u793A\u795E\u798F\u7A0B\u7A76\u7A7A\u7BA1\u7C73\u7F6E\u7F8E\u80B2\u81F3\u822C\u8272\u8457\u88AB\u89E3\u8A00\u8C61\u8D77\u8DEF\u8EAB\u8FD1\u9020\u91CC\u91CF\u91D1\u9650\u9662\u96C6\u975E\u9762\u97F3\u9996\u9999
NGram.KANJI_7_35=\u55C5\u57A2\u58D5\u59E5\u637A\u74E2\u7CE0\u895F
NGram.KANJI_7_37=\u4E19\u4E32\u4E4F\u4E91\u4EC7\u4ED4\u4F0D\u5141\u51E1\u51F6\u51F8\u52AB\u535C\u53C9\u53DB\u540A\u5410\u54C0\u559D\u5750\u5751\u576A\u57E0\u5824\u582A\u5830\u5835\u5851\u5858\u586B\u5954\u59FB\u5A46\u5B5F\u5BB4\u5BD3\u5C16\u5C60\u5CFB\u5D16\u5E16\u5E3D\u5E7D\u5E87\u5ECA\u5FD9\u60DC\u60F9\u6155\u6167\u6234\u626E\u6276\u6284\u633A\u6377\u6492\u649E\u64B0\u6562\u6591\u65A5\u65E6\u65FA\u6602\u670B\u676D\u68AF\u695A\u6B23\u6BC5\u6C70\u6C83\u6CE1\u6D8C\u6DD8\u6E20\u71D5\u72D0\u72D7\u73B2\u73CA\u7433\u7483\u74DC\u74F6\u7554\u764C\u7761\u77DB\u78A7\u7A46\u7A7F\u7A84\u7C97\u7D2F\u7FC1\u7FE0\u8000\u8017\u808C\u80AF\u8404\u8461\u8463\u8475\u8513\u85AA\u8679\u86CB\u871C\u87BA\u88F8\u8C8C\u8DF3\u8FC4\u901D\u9022\u906E\u9075\u9192\u91C7\u966A\u971E\u9910\u9B41\u9F0E\u9F20
TO_NORMALIZE_VI_CHARS=AEIOUYaeiouy\u00c2\u00ca\u00d4\u00e2\u00ea\u00f4\u0102\u0103\u01a0\u01a1\u01af\u01b0
DMARK_CLASS=\u0300\u0301\u0303\u0309\u0323
NORMALIZED_VI_CHARS_0300=\u00C0\u00C8\u00CC\u00D2\u00D9\u1EF2\u00E0\u00E8\u00EC\u00F2\u00F9\u1EF3\u1EA6\u1EC0\u1ED2\u1EA7\u1EC1\u1ED3\u1EB0\u1EB1\u1EDC\u1EDD\u1EEA\u1EEB
NORMALIZED_VI_CHARS_0301=\u00C1\u00C9\u00CD\u00D3\u00DA\u00DD\u00E1\u00E9\u00ED\u00F3\u00FA\u00FD\u1EA4\u1EBE\u1ED0\u1EA5\u1EBF\u1ED1\u1EAE\u1EAF\u1EDA\u1EDB\u1EE8\u1EE9
NORMALIZED_VI_CHARS_0303=\u00C3\u1EBC\u0128\u00D5\u0168\u1EF8\u00E3\u1EBD\u0129\u00F5\u0169\u1EF9\u1EAA\u1EC4\u1ED6\u1EAB\u1EC5\u1ED7\u1EB4\u1EB5\u1EE0\u1EE1\u1EEE\u1EEF
NORMALIZED_VI_CHARS_0309=\u1EA2\u1EBA\u1EC8\u1ECE\u1EE6\u1EF6\u1EA3\u1EBB\u1EC9\u1ECF\u1EE7\u1EF7\u1EA8\u1EC2\u1ED4\u1EA9\u1EC3\u1ED5\u1EB2\u1EB3\u1EDE\u1EDF\u1EEC\u1EED
NORMALIZED_VI_CHARS_0323=\u1EA0\u1EB8\u1ECA\u1ECC\u1EE4\u1EF4\u1EA1\u1EB9\u1ECB\u1ECD\u1EE5\u1EF5\u1EAC\u1EC6\u1ED8\u1EAD\u1EC7\u1ED9\u1EB6\u1EB7\u1EE2\u1EE3\u1EF0\u1EF1
1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/ar

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/bg

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/bn

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/ca

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/el

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/et

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/fa

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/gu

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/he

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/hi

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/hr

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/hu

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/ja

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/ko
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"freq":{"¤":8,"¸":6,"·":50,"´":30," 가ㅏ":15," 가ㅎ":12," 가ㅈ":7," 가ㅋ":98," 가ㅅ":7," 가ㅇ":26," 가ㅁ":13," 가ㅜ":11," 가ㅐ":7," 가ㅓ":10," 가ㄹ":31," 가ㄴ":29," 가ㄱ":20,"ㄱㄱ ":11," 가ㅣ":10," 가ㅠ":20," 가ㅡ":9,"ˇ":6,"˝":6,"ㅅ가가":25,"ω":19,"가가ㄹ":54,"가가ㄱ":37,"가가ㄴ":92,"가가ㄷ":32,"가가ㅡ":50,"가가ㅠ":600,"가가ㅣ":46,"가가ㅜ":179,"가가ㅗ":8,"가가ㅔ":17,"가가ㅓ":33,"가가ㅐ":12,"가가ㅑ":7,"가가ㅏ":68,"가가ㅎ":104,"가가ㅌ":12,"가가ㅋ":1102,"가가ㅈ":10,"가가ㅇ":106," ㄹ ":6,"가가ㅆ":10,"가가ㅅ":29,"가가ㅂ":6,"가가ㅁ":20," ´":22," ·":8,"가가」":32,"가가「":31,"가가』":11,"가가】":7," ㅁ ":63," ㅂ ":19," ㅇ ":81,"ㅡㅡㅡ":10," ㅅ ":66," ㄱ ":11," ㄷ ":9," ω":10," ㄴ ":6,"ㅈ가가":16,"ㅠㅜㅠ":47,"ㅠㅜㅜ":54,"ㅠㅠㅋ":9,"ㅠㅠㅜ":47,"ㅠㅠㅠ":2506,"ㅠㅡㅠ":12,"가/가":6,"ㅇ가가":82,"๑":12,"ㅠㅋㅋ":8,"า":6,"ᆢ":11,"ᆞ":18," "":8," ノ":19," 。":6,"  ̄":6,"가가▶":13,"ㅇㅇ가":12," ๑":9,"ㅁ가가":17,"´ ":8,"】가":7,"가가→":10,"www":10,"「가":83,"【가":7,"」가":33,"『가":17,"· ":6," ㅠ ":124," ㅡ ":19," ㅜ ":20," ㅍ ":10," ㅎ ":47," ㅋ ":92,"ㅂ가가":11,"ㅋㅋ가":741,"∇":9,"∀":10,"가ㅣ ":7,"ㄹㄹ ":7,"⊙":14,"가ㅜ ":14,"⌒":10,"가ㅠ ":57," 」 ":29," 「 ":8,"가ㅓ ":12,"ノ ":9,"가ㅋ ":48,"가ㅐ ":6,"↑":19,"→":21,"↗":6,"가ㅎ ":19,"가ㅏ ":16," ⊙":7,"가ㅇ ":18," ⌒":8,"가ㄹ ":19,"ㅋㄱ가":10,"가ㅁ ":11,"가ㄱ ":12,"가ㄴ ":20,"가가ᆢ":6,"가가ᆞ":11,"┌":6,"─":14," →":8,"▶":46,"▷":7,"▽":21,"□":11,"◐":11,"◑":9,"◇":9,"●":18,"○":9,"ㄷㄷ ":24,"ww":11," ▽":10," ▶":25," ▷":7," ●":13,"ㅠ가ㅠ":9," 丈가":9,"가·가":38,"▶▶ ":6,"ああ ":6," ̄ ":6," 【":11," 『":14," 「":59," 」":64,"  ":6,"ㅣ":106,"ㅡ":253,"ㅠ":5116,"ㅔ":42,"ㅕ":20,"ㅗ":65,"ㅐ":36,"ㄹ가가":56,"ㅑ":14,"ㅓ":75,"ㅜ":1064,"ㅛ":9,"ㅅ":213,"ㅇ":635,"ㅆ":17,"ㅁ":191,"ㅃ":11,"ㅂ":77,"ㅍ":30,"ㅌ":55,"ㅏ":203,"ㅎ":887,"ㅉ":19,"ㅈ":58,"ㅋ":18985,"ㅊ":18,"ㄲ":26,"ㄱ":238,"ㄷ":173,"ㄴ":225,"ㄸ":15,"ㄹ":159," ア ":15,"あ":51,"? 
":6," 가 ":12034," ":15,"』":15,"『":17,"」":105,"「":104,"【":14,"】":14,"" ":8,"〜":7,"ア":38,"─ ":6,"ㄴㄴ ":8,"ㅎㅎ가":30," ㅜ":143," ㅗ":7," ㅋ":1426," ㅉ":6," ㅈ":23," ㅏ":9," ㅎ":263," ㅍ":18," ㅌ":10," ㅂ":30," ㅁ":82," ㅇ":266," ㅅ":95," ㄸ":11," ㄹ":15," ㄱ":36," ㄲ":7," ㄴ":38," ㄷ":40,"가가 ":101467," ㅡ":74," ㅠ":709," ㅣ":10,"가가·":32," あ":16," ア":23,"」가 ":10," あ ":11,"人人人":6,"` ":9,"「가 ":16,"ㄱ가 ":13," ̄ ̄":8,"。 ":6,"ㅣㅣ":7,"ㅣㅇ":6,"ㅡㅡ":84,"ㅡㅠ":13,"ㅡㅜ":11,"ㅠㅠ":3579,"ㅠㅡ":17,"/가가":6,"ㅕㅕ":7,"ㅗㅗ":38,"ㅓㅓ":14,"ㅜㅜ":559,"ㅜㅡ":11,"ㅜㅠ":119,"ㅠㅋ":9,"ㅠㅜ":132,"ㅅㅎ":10,"ㅅㅇ":34,"ㅅㅂ":18,"ㅇㅠ":6,"ㅈㄹ":6,"ㅇㅏ":8,"ㅇㅇ":75,"ㅇㅎ":8,"ㅇㅋ":9,"ㅇㅁ":52,"ㅇㅂ":10,"ㅇㅅ":42,"ㅁㅇ":49,"ㅁㅁ":7,"ㅂㅇ":10,"ㅎㅇ":13,"ㅎㅅ":10,"ㄴ가가":75," "가":7,"ㅌㅌ":9,"ㅏㅏ":52,"ㅎㅎ":468,"ㅉㅉ":8,"ㅁㅇ ":40,"ㅋㅋ":16188,"ㅋㅌ":14,"ㅋㅠ":7,"ㅌㅋ":25,"ㅋㄱ":75,"ㅋㄲ":6,"ㄱㄱ":49,"ㄱㅋ":55,"ㄷㄷ":78,"ㄴㄴ":29,"/가":6,"ㄸㄹ":12,"ㄹㄹ":17,"ㄱㅋㅋ":32,"ㄱㅋㄱ":12,"之":27,"中":11,"並":16,"丘":15,"丈":12,"三":53,"丁":17,"人":10,"亞":8,"亂":8,"ㅣ가 ":13,""가":7,"ㄷ가가":6,"三三":7," ▽ ":10,"가ㄷㄷ":10,"가ㄱㄱ":8," ▶ ":12," 丘":7,"가ㅇㅇ":15," 丈":12," 三":20," 丁":7,"가ㅇㅁ":8,"가ㅇㅅ":10,"ㄱㄱㅋ":9,"ㄱㄱㄱ":15,"가ㅏㅏ":19,"가ㅎㅎ":69,"가ㅌㅋ":6,"가ㅋㅋ":1130," ● ":9,"가ㅓㅓ":6,"가ㅜㅜ":118,"가ㅜㅠ":23,"가ㅠㅜ":32," 中":10," 之":13,"가ㅡㅡ":32,"가ㅠㅡ":6,"가ㅠㅠ":506,"人人":7,"→ ":9,"↑ ":6,"ㄱ가가":41,"ㅠ가 ":26," ㅠ가":9," ㅡ가":7," ㅎ가":8," ㅋ가":10," ㅈ가":7," ㅇ가":23," ㅅ가":7,"ああ":27,"▶가":7," ㅁ가":6," ㄴ가":8,"ㅋㄱ ":6," ㄱ가":6,"ㅇㅇ ":37,"▶가가":7,"」「":6,"ㅏ가ㅏ":9,"ㅜ가 ":7,"ㅔ가 ":8,""가가":6,"ᆞ가가":11,"ㄸㄹㄹ":9,"ω ":11,"ㅅㅂ ":9,"ㅅㅇ ":21,"⌒ ":8,"ㅠㅜ가":9,"ㅂㅇ ":7,"↑↑":8,"ㅜㅜ가":53,"가ㄹ":87,"가ㄷ":36,"가ㄴ":124,"가ㄲ":6,"가ㄱ":59,"가ㅛ":6,"가ㅜ":196,"가ㅑ":9,"가ㅐ":19,"가ㅓ":48,"가ㅕ":6,"가ㅔ":24,"가ㅗ":10,"가ㅈ":18,"가ㅋ":1235,"가ㅌ":14,"가ㅍ":6,"가ㅎ":120,"가ㅏ":101,"가ㅁ":34,"가ㅂ":8,"가ㅅ":40,"가ㅆ":11,"가ㅇ":137,"가ㅣ":62,"가ㅠ":655,"가ㅡ":61,"ㄷㄷㄷ":37,"가』":14,"가「":35,"가」":34,"ㅋ가ㅋ":25," 之 ":8,"가】":7,"ㅋ가ㅠ":14,"ㅅ가 ":16,"가ᆞ":12,"가ᆢ":8,"가가가":150443,"ㅇ가 ":33,"】 ":6,"』 ":10,"」 ":44,"「 ":14,"ㅋㅋ ":955,"ㅎㅎ ":124,"👍":18,"ㅠㅠ가":224," 가가":104979,"あ ":18,"ㅡㅡ가":9,"가→":11,"→가가":7,"ㅋ가 ":92,"ㅏㅏ ":10," 『가":14," 「가":48," 」가":15,"ㅈ가 ":7,"ㅣ가가":37,"·가가":33,"가─":7,"가▶":13," ノ ":9,"ㄷ 
":39,"ㅡ가가":28,"ㄴ ":40,"ㄲ ":8,"ㄱ ":44,"ㅁ ":79,"ㄹ ":42,"ㅈ ":12,"ㅇㅇㅇ":10,"๑ ":8,"ㅇㅅㅇ":32,"ㅇ ":221,"ㅅ ":79,"ㅂ ":31,"가" ":8," 가":117510,"ㅇㅁㅇ":47,"ㅏ가 ":14,"가/":6,"가"":8,"ㅇㅂㅇ":10,"가ㅡ가":11,"가ㅠ가":26,"가ㅣ가":40,"ㅠ가가":227,"가ㅜ가":14,"가ㅓ가":15,"가ㅔ가":10,"가ㅏ가":51,"가ㅐ가":9,"가ㅋ가":23,"가":378431,"가ㅎ가":16,"三가":7,"丈가":9,"ㅋ ":1110,"ㅍ ":13,"ㅏ ":40,"ㅎ ":201,"丈가가":8,"ㅐ ":10,"ㅓ ":17,"ㅔ ":10,"ㅗ ":8,"ㅜ ":141,"ㅠ ":694,"ㅡ ":73,"ㅣ ":13,"ㅋㅠㅠ":6,"가·":39,"가 ":114163,"ㅌㅋㅋ":19,"ㅋㅌㅋ":11,"ㅋㅋㅠ":6,"ㅓ가 ":8,"ㅋㅋㄲ":6,"ㅋㅋㄱ":50,"ㅋㅋㅋ":13704,"ㅋㅋㅌ":9,"ア ":21,"ㅋㄱㄱ":18,"ㅋㄱㅋ":36,"ㅜ가가":72,"👍 ":6,"ㅎㅇㅎ":6,"가ㅁ가":17,"ㅎㅅㅎ":9,"가ㅂ가":7,"가ㅅ가":32,"ㅎㅎㅎ":232,"가ㅇ가":65,"가ㅈ가":13,"가ㄱ가":30,"가ㄴ가":90,"가ㄹ가":63,"가ㄷ가":13,"ㅜ가":86,"ㅣ가":59,"ㅠ가":267,"ㅡ가":33,"ㅔ가":19,"ㅓ가":21,"ㅏㅏㅏ":32,"ㅎ가":57," ㅡㅡ":39,"ㅍ가":7," ㅠㅠ":504," ㅠㅡ":8,"ㅌ가":11,"ㅋ가":782," ㅠㅜ":15,"ㅑ가":6,"ㅐ가":14,"ㅏ가":79,"ㅅ가":49,"ㅆ가":8,"ㅇ가":122,"ㅜㅠ ":18,"ㅈ가":24," ㅜㅜ":90," ㅜㅠ":12,"ㅜㅜ ":88,"ㅁ가":27,"ㅂ가":15," ㅇㅏ":6," ㅇㅇ":48," ㅋㄱ":6,"ㄴ가":131,"ㄷ가":20," ㅋㅋ":1269," ㅎㅎ":164,"ㄹ가":82," ㅎㅅ":8," ㅎㅇ":7,"가ᆞ가":8," ㅅㅂ":13,"ㄴ가 ":46,"ㄱ가":57," ㅇㅁ":43," ㅇㅂ":9," ㅇㅅ":30," → ":7,"·가":40,"?":18," ?":17,"▶ ":18,"ㅓㅓㅓ":8,"▽ ":12,"◇ ":6,"ㅠㅜ ":11,"ㅓ가가":9,"?":8,"?":38,"● ":13,"ㅠㅠ ":476,"ㄴㄴ가":14,"가가/":6,"가가"":8,"가👍":12,"ㄹ가 ":17,"ㅡㅠ ":9,"ㅔ가가":8,"ㅡㅡ ":47,"가」가":18,"가「가":27,"?가":9,"가?":15,"ㄷ가 ":12,"👍가":9,"「가가":66,"ㅏ가가":50,"」가가":19,"ㅗㅗㅗ":29," ω ":9," ▶가":7,"『가가":16,"ㅐ가가":6,"ᆞ가":12,"?가가":9,"【가가":7,"ㅎ가가":45,"】가가":6,"丘 ":9,"가가?":15,"가→가":7,""":16,")":8,"(":8,"/":13,"^":10,"_":8,"`":11,"w":13,"・":8,"。":8,"ノ":19,"゚":8," ̄":20,"가가":257747,"ㅋ가가":638,"ㅜㅠㅠ":44,"ㅜㅠㅜ":34,"▶▶":6,"ㅜㅡㅜ":6," ⌒ ":8,"ㅁ가 ":9,"□□":7,"三 ":15,"가』 ":10,"가」 ":11,"ㅜㅜㅡ":6,"ㅜㅜㅠ":32,"ㅜㅜㅜ":289,"丁 ":7,"ㅌ가가":8," ㄴㄴ":18," ㄱㄱ":7," ㄸㄹ":10," ㄷㄷ":27,"→가":7,"あああ":17,"之 ":11},"n_words":[408659,524156,400873],"name":"ko"}
1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/lt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/lv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/mk

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/ml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/pa

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/ru

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/si

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/sq

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/ta

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/te

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/th

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/tl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/uk

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/ur

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/zh-cn

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/main/resources/languages.shorttext/zh-tw

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ public class TechnicalLanguageDetectorImplTest {
private static final String TRAINING_EN = "a a a b b c c d e";
private static final String TRAINING_FR = "a b b c c c d d d";
private static final String TRAINING_JA = "\u3042 \u3042 \u3042 \u3044 \u3046 \u3048 \u3048";
private static final String TRAINING_MS = "a a e e e m m n n";


private LanguageDetector makeDetector() {
Expand All @@ -51,6 +52,9 @@ private LanguageDetector makeDetector() {
profileBuilder = new LanguageProfileBuilder(LdLocale.fromString("ja"));
add(detectorBuilder, profileBuilder, TRAINING_JA);

profileBuilder = new LanguageProfileBuilder(LdLocale.fromString("ms"));
add(detectorBuilder, profileBuilder, TRAINING_MS);

return detectorBuilder.build();
}
private void add(LanguageDetectorBuilder detectorBuilder, LanguageProfileBuilder profileBuilder, String trainingEn) {
Expand Down Expand Up @@ -84,4 +88,11 @@ public final void testDetector4() {
LanguageDetector languageDetector = makeDetector();
assertEquals(languageDetector.detect("\u3042\u3042\u3042\u3042a").get().getLanguage(), "ja");
}

/**
 * Verifies that the detector built by {@code makeDetector()} reports the
 * language "ms" for the short input "a e".
 */
@Test
public final void testDetector5() {
    assertEquals(
            makeDetector().detect("a e").get().getLanguage(),
            "ms");
}

}

0 comments on commit 467e52f

Please sign in to comment.