Merge pull request #83 from twitter/fix_stemming_bug

Fix stemming bug
twitter · Nov 11, 2015 · 8ddec2e · 8ddec2e
2 parents 673c093 + a17ed29
commit 8ddec2e
Show file tree

Hide file tree

Showing 17 changed files with 336 additions and 280 deletions.
diff --git a/src/main/resources/com/twitter/penguin/korean/util/adjective/adjective.txt b/src/main/resources/com/twitter/penguin/korean/util/adjective/adjective.txt
@@ -325,6 +325,7 @@
 아름답
 아무렇
 아쉽
+아찔하
 아프
 안남
 안녕하

diff --git a/src/main/resources/com/twitter/penguin/korean/util/auxiliary/exclamation.txt b/src/main/resources/com/twitter/penguin/korean/util/auxiliary/exclamation.txt
@@ -170,6 +170,7 @@
 프흐흐
 플
 피
+하
 하모
 하뿔싸
 하아

diff --git a/src/main/resources/com/twitter/penguin/korean/util/noun/foreign.txt b/src/main/resources/com/twitter/penguin/korean/util/noun/foreign.txt
@@ -421,10 +421,13 @@
 아나타노
 아노하나
 아뒤
+아따시
 아라마키
 아라비안
 아라키타
 아로마
+아로이
+아롱이
 아메미야
 아미
 아바타
@@ -442,6 +445,7 @@
 아이시떼루
 아이에프
 아이컨택
+아임인러브
 아즈마
 아츠야
 아카기
@@ -471,6 +475,7 @@
 야매
 야스토모
 야오
+야토가미
 얀데레
 양웬리
 어덜트
@@ -926,6 +931,7 @@
 피카추
 피티오
 피팅
+피피티
 핀업
 하나요
 하데스
@@ -947,6 +953,7 @@
 하이파이브회
 하이퍼
 하이퍼마켓
+하젠
 하치만
 하치켄
 하코네

diff --git a/src/main/resources/com/twitter/penguin/korean/util/noun/nouns.txt b/src/main/resources/com/twitter/penguin/korean/util/noun/nouns.txt
@@ -14771,6 +14771,8 @@
 아메바
 아멘
 아명
+아무
+아무나
 아바마마
 아방궁
 아버님
@@ -20123,7 +20125,6 @@
 제승
 제시
 제씨
-제아무리
 제안
 제압
 제약

diff --git a/src/main/resources/com/twitter/penguin/korean/util/noun/profane.txt b/src/main/resources/com/twitter/penguin/korean/util/noun/profane.txt
@@ -37,6 +37,8 @@
 씹새야
 야걸
 야동
+야설
+야애니
 엔조이
 오나니
 오피

diff --git a/src/main/resources/com/twitter/penguin/korean/util/substantives/suffix.txt b/src/main/resources/com/twitter/penguin/korean/util/substantives/suffix.txt
@@ -62,5 +62,6 @@
 킬로
 킬로미터
 틱
+하
 형
 화
diff --git a/src/main/resources/com/twitter/penguin/korean/util/typos/typos.txt b/src/main/resources/com/twitter/penguin/korean/util/typos/typos.txt
@@ -28,6 +28,7 @@
 그래욤 그래요
 그랫어요 그랬어요
 그러시져 그러시죠
+그로묜 그러면
 그지같 거지같
 그쵸 그렇죠
 글애요 그래요

diff --git a/src/main/resources/com/twitter/penguin/korean/util/verb/eomi.txt b/src/main/resources/com/twitter/penguin/korean/util/verb/eomi.txt
@@ -489,6 +489,7 @@
 마다에게
 마다의
 마따나
+마라
 마저
 마저나마라도
 마저도

diff --git a/src/main/resources/com/twitter/penguin/korean/util/verb/verb.txt b/src/main/resources/com/twitter/penguin/korean/util/verb/verb.txt
@@ -1288,7 +1288,9 @@
 피우
 피하
 하
+하고프
 하므
+하옵
 한잔하
 한정되
 한정하

diff --git a/src/main/scala/com/twitter/penguin/korean/stemmer/KoreanStemmer.scala b/src/main/scala/com/twitter/penguin/korean/stemmer/KoreanStemmer.scala
@@ -25,8 +25,8 @@ object KoreanStemmer {
     }
 
     val stemmed = tokens.foldLeft(List[KoreanToken]()) {
-      case (l: List[KoreanToken], token: KoreanToken) if Endings.contains(token.pos) =>
-        if (!l.isEmpty && Predicates.contains(l.head.pos)) {
+      case (l: List[KoreanToken], token: KoreanToken) if l.nonEmpty && Endings.contains(token.pos) =>
+        if (Predicates.contains(l.head.pos)) {
           val prevToken = l.head
           KoreanToken(
             prevToken.text,

diff --git a/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala b/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala
@@ -28,19 +28,20 @@ import scala.collection.JavaConversions._
 import scala.collection.mutable
 
 /**
- * Provides Korean tokenization.
- *
- * Chunk: 어절 - 공백으로 구분되어 있는 단위 (사랑하는사람을)
- * Word: 단어 - 하나의 문장 구성 요소 (사랑하는, 사람을)
- * Token: 토큰 - 형태소와 비슷한 단위이지만 문법적으로 정확하지는 않음 (사랑, 하는, 사람, 을)
- *
- * Whenever there is an updates in the behavior of KoreanParser,
- * the initial cache has to be updated by running tools.CreateInitialCache.
- */
+  * Provides Korean tokenization.
+  *
+  * Chunk: 어절 - 공백으로 구분되어 있는 단위 (사랑하는사람을)
+  * Word: 단어 - 하나의 문장 구성 요소 (사랑하는, 사람을)
+  * Token: 토큰 - 형태소와 비슷한 단위이지만 문법적으로 정확하지는 않음 (사랑, 하는, 사람, 을)
+  *
+  * Whenever there is an update in the behavior of KoreanParser,
+  * the initial cache has to be updated by running tools.CreateInitialCache.
+  */
 object KoreanTokenizer {
   private val TOP_N_PER_STATE = 5
   private val MAX_TRACE_BACK = 8
 
+  // Lower score is better
   private val WEIGHT_TOKENS = 0.18f
   private val WEIGHT_UNKNOWNS = 0.3f
   private val WEIGHT_WORDS = 0.3f
@@ -51,16 +52,22 @@ object KoreanTokenizer {
   private val WEIGHT_ALL_NOUN = 0.1f
   private val WEIGHT_PREFFERED_PATTERN = 0.6f
   private val WEIGHT_DETERMINER = -0.01f
+  private val WEIGHT_EXCLAMATION = 0.01f
+
+  // Suppress a leading suffix-like token (Suffix, Eomi, Josa, PreEomi) when candidates are tied
+  private val WEIGHT_INITIAL_POSTPOSITION = 0.2f
+
+  private val WEIGHT_HA_VERB = 0.3f
+
+  private val PREFERRED_PATTERNS = Seq(Seq(Noun, Josa), Seq(ProperNoun, Josa))
 
-  private val PREFERRED_PATTERN_NOUN = Seq(Noun, Josa)
-  private val PREFERRED_PATTERN_PROPER_NOUN = Seq(ProperNoun, Josa)
 
   /**
-   * A candidate parse for a chunk.
-   *
-   * @param posNodes Sequence of KoreanTokens.
-   * @param words Number of words in this candidate parse.
-   */
+    * A candidate parse for a chunk.
+    *
+    * @param posNodes Sequence of KoreanTokens.
+    * @param words Number of words in this candidate parse.
+    */
   case class ParsedChunk(posNodes: Seq[KoreanToken], words: Int) {
     def ++(that: ParsedChunk) = {
       ParsedChunk(this.posNodes ++ that.posNodes, this.words + that.words)
@@ -75,17 +82,30 @@ object KoreanTokenizer {
       isExactMatch * WEIGHT_EXACT_MATCH +
       isAllNouns * WEIGHT_ALL_NOUN +
       isPreferredPattern * WEIGHT_PREFFERED_PATTERN +
-      countPos(Determiner) * WEIGHT_DETERMINER
+      countPos(Determiner) * WEIGHT_DETERMINER +
+      countPos(Exclamation) * WEIGHT_EXCLAMATION +
+      isInitialPostPosition * WEIGHT_INITIAL_POSTPOSITION +
+      isNounHa * WEIGHT_HA_VERB
 
     lazy val countUnknowns = this.posNodes.count { p: KoreanToken => p.unknown }
     lazy val countTokens = this.posNodes.size
 
+    val suffixes = Set(Suffix, Eomi, Josa, PreEomi)
+
+    val preferredBeforeHaVerb = Set(Noun, ProperNoun, VerbPrefix)
+
+    lazy val isInitialPostPosition = if (suffixes.contains(this.posNodes.head.pos)) 1 else 0
     lazy val isExactMatch = if (this.posNodes.size == 1) 0 else 1
     lazy val isAllNouns = if (this.posNodes.exists(t => t.pos != Noun && t.pos != ProperNoun)) 1 else 0
     lazy val isPreferredPattern = if (
-      posNodes.size == 2 &&
-        (posNodes.map(_.pos) == PREFERRED_PATTERN_NOUN || posNodes.map(_.pos) == PREFERRED_PATTERN_PROPER_NOUN)
-    ) 0 else 1
+      posNodes.size == 2 && PREFERRED_PATTERNS.contains(posNodes.map(_.pos))
+    ) 0
+    else 1
+
+    lazy val isNounHa = if (this.posNodes.size >= 2
+      && preferredBeforeHaVerb.contains(this.posNodes.head.pos)
+      && this.posNodes(1).pos == Verb
+      && this.posNodes(1).text.startsWith("하")) 0 else 1
 
     lazy val posTieBreaker = this.posNodes.map(_.pos.id).sum
 
@@ -114,33 +134,33 @@ object KoreanTokenizer {
   }
 
   /**
-   * 0 for optional, 1 for required
-   * * for optional repeatable, + for required repeatable
-   *
-   * Substantive: 체언 (초거대기업의)
-   * Predicate: 용언 (하였었습니다, 개예뻤었다)
-   * Modifier: 수식언 (모르는 할수도있는 보이기도하는 예뻐 예쁜 완전 레알 초인간적인 잘 잘한)
-   * Standalone: 독립언
-   * Functional: 관계언 (조사)
-   *
-   * N Noun: 명사 (Nouns, Pronouns, Company Names, Proper Noun, Person Names, Numerals, Standalone, Dependent)
-   * V Verb: 동사 (하, 먹, 자, 차)
-   * J Adjective: 형용사 (예쁘다, 크다, 작다)
-   * A Adverb: 부사 (잘, 매우, 빨리, 반드시, 과연)
-   * D Determiner: 관형사 (새, 헌, 참, 첫, 이, 그, 저)
-   * E Exclamation: 감탄사 (헐, ㅋㅋㅋ, 어머나, 얼씨구)
-   *
-   * C Conjunction: 접속사
-   *
-   * j SubstantiveJosa: 조사 (의, 에, 에서)
-   * l AdverbialJosa: 부사격 조사 (~인, ~의, ~일)
-   * e Eomi: 어말어미 (다, 요, 여, 하댘ㅋㅋ)
-   * r PreEomi: 선어말어미 (었)
-   *
-   * p NounPrefix: 접두사 ('초'대박)
-   * v VerbPrefix: 동사 접두어 ('쳐'먹어)
-   * s Suffix: 접미사 (~적)
-   */
+    * 0 for optional, 1 for required
+    * * for optional repeatable, + for required repeatable
+    *
+    * Substantive: 체언 (초거대기업의)
+    * Predicate: 용언 (하였었습니다, 개예뻤었다)
+    * Modifier: 수식언 (모르는 할수도있는 보이기도하는 예뻐 예쁜 완전 레알 초인간적인 잘 잘한)
+    * Standalone: 독립언
+    * Functional: 관계언 (조사)
+    *
+    * N Noun: 명사 (Nouns, Pronouns, Company Names, Proper Noun, Person Names, Numerals, Standalone, Dependent)
+    * V Verb: 동사 (하, 먹, 자, 차)
+    * J Adjective: 형용사 (예쁘다, 크다, 작다)
+    * A Adverb: 부사 (잘, 매우, 빨리, 반드시, 과연)
+    * D Determiner: 관형사 (새, 헌, 참, 첫, 이, 그, 저)
+    * E Exclamation: 감탄사 (헐, ㅋㅋㅋ, 어머나, 얼씨구)
+    *
+    * C Conjunction: 접속사
+    *
+    * j SubstantiveJosa: 조사 (의, 에, 에서)
+    * l AdverbialJosa: 부사격 조사 (~인, ~의, ~일)
+    * e Eomi: 어말어미 (다, 요, 여, 하댘ㅋㅋ)
+    * r PreEomi: 선어말어미 (었)
+    *
+    * p NounPrefix: 접두사 ('초'대박)
+    * v VerbPrefix: 동사 접두어 ('쳐'먹어)
+    * s Suffix: 접미사 (~적)
+    */
   private val SequenceDefinition = Map(
     // Substantive
     "D0p*N1s0j0" -> Noun,
@@ -164,15 +184,15 @@ object KoreanTokenizer {
   case class PossibleTrie(curTrie: KoreanPosTrie, words: Int)
 
   /**
-   * Find the best parse using dynamic programming.
-   *
-   * @param chunk Input chunk. The input has to be entirely. Check for input validity is skipped
-   *              for performance optimization. This method is private and is called only by tokenize.
-   * @return The best possible parse.
-   */
+    * Find the best parse using dynamic programming.
+    *
+    * @param chunk Input chunk. The input has to be entirely Korean. Check for input validity is skipped
+    *              for performance optimization. This method is private and is called only by tokenize.
+    * @return The best possible parse.
+    */
   private[this] def parseKoreanChunk(chunk: KoreanToken): Seq[KoreanToken] = {
-
     // Direct match
+    // This may produce 하 -> PreEomi
     koreanDictionary.foreach {
       case (pos, dict) =>
         if (dict.contains(chunk.text)) {
@@ -245,11 +265,11 @@ object KoreanTokenizer {
   }
 
   /**
-   * Parse Korean text into a sequence of KoreanTokens
-   *
-   * @param text Input Korean chunk
-   * @return sequence of KoreanTokens
-   */
+    * Parse Korean text into a sequence of KoreanTokens
+    *
+    * @param text Input Korean chunk
+    * @return sequence of KoreanTokens
+    */
   def tokenize(text: CharSequence): Seq[KoreanToken] = {
     try {
       chunk(text).flatMap {

diff --git a/src/test/resources/com/twitter/penguin/korean/util/adj_conjugate.txt b/src/test/resources/com/twitter/penguin/korean/util/adj_conjugate.txt
@@ -320,6 +320,7 @@
 씩씩하	씩씩하, 씩씩하거, 씩씩하게, 씩씩하겠, 씩씩하고, 씩씩하구, 씩씩하기, 씩씩하긴, 씩씩하길, 씩씩하냐, 씩씩하네, 씩씩하노, 씩씩하느, 씩씩하는, 씩씩하니, 씩씩하다, 씩씩하더, 씩씩하던, 씩씩하도, 씩씩하든, 씩씩하러, 씩씩하려, 씩씩하며, 씩씩하면, 씩씩하세, 씩씩하셔, 씩씩하셨, 씩씩하습, 씩씩하시, 씩씩하신, 씩씩하실, 씩씩하십, 씩씩하여, 씩씩하였, 씩씩하자, 씩씩하잖, 씩씩하재, 씩씩하져, 씩씩하죠, 씩씩하지, 씩씩하진, 씩씩하질, 씩씩한, 씩씩할, 씩씩함, 씩씩합, 씩씩해, 씩씩해도, 씩씩해서, 씩씩해써, 씩씩해야, 씩씩해준, 씩씩했, 씩씩히
 씬나	씬나, 씬나거, 씬나게, 씬나겠, 씬나고, 씬나구, 씬나기, 씬나긴, 씬나길, 씬나냐, 씬나네, 씬나노, 씬나느, 씬나는, 씬나니, 씬나다, 씬나더, 씬나던, 씬나도, 씬나든, 씬나러, 씬나려, 씬나며, 씬나면, 씬나서, 씬나세, 씬나셔, 씬나셨, 씬나습, 씬나시, 씬나신, 씬나실, 씬나십, 씬나써, 씬나야, 씬나자, 씬나잖, 씬나재, 씬나져, 씬나죠, 씬나준, 씬나지, 씬나진, 씬나질, 씬난, 씬날, 씬남, 씬납, 씬났
 아깝	아까, 아까운, 아까워, 아까웠, 아깝
+아찔하	아찔하, 아찔하거, 아찔하게, 아찔하겠, 아찔하고, 아찔하구, 아찔하기, 아찔하긴, 아찔하길, 아찔하냐, 아찔하네, 아찔하노, 아찔하느, 아찔하는, 아찔하니, 아찔하다, 아찔하더, 아찔하던, 아찔하도, 아찔하든, 아찔하러, 아찔하려, 아찔하며, 아찔하면, 아찔하세, 아찔하셔, 아찔하셨, 아찔하습, 아찔하시, 아찔하신, 아찔하실, 아찔하십, 아찔하여, 아찔하였, 아찔하자, 아찔하잖, 아찔하재, 아찔하져, 아찔하죠, 아찔하지, 아찔하진, 아찔하질, 아찔한, 아찔할, 아찔함, 아찔합, 아찔해, 아찔해도, 아찔해서, 아찔해써, 아찔해야, 아찔해준, 아찔했, 아찔히
 아니	아녀, 아녔, 아니, 아니냐, 아니노, 아니느, 아니는, 아니니, 아니어, 아니었, 아닌, 아닐, 아님, 아닙, 아닙니
 아련하	아련하, 아련하거, 아련하게, 아련하겠, 아련하고, 아련하구, 아련하기, 아련하긴, 아련하길, 아련하냐, 아련하네, 아련하노, 아련하느, 아련하는, 아련하니, 아련하다, 아련하더, 아련하던, 아련하도, 아련하든, 아련하러, 아련하려, 아련하며, 아련하면, 아련하세, 아련하셔, 아련하셨, 아련하습, 아련하시, 아련하신, 아련하실, 아련하십, 아련하여, 아련하였, 아련하자, 아련하잖, 아련하재, 아련하져, 아련하죠, 아련하지, 아련하진, 아련하질, 아련한, 아련할, 아련함, 아련합, 아련해, 아련해도, 아련해서, 아련해써, 아련해야, 아련해준, 아련했, 아련히
 아름답	아름다, 아름다운, 아름다워, 아름다웠, 아름답
-Original file line number
+Diff line change
@@ Expand Up / @@ -170,6 +170,7 @@ @@
     프흐흐
     플
     피
+    하
     하모
     하뿔싸
     하아
@@ Expand Down @@