Added Language dependent Configuration (0.0.8)
+ Added a filter attribute to deactivate the high pass filter (Default:
filter activated)
+ Added language dependent configuration
+ Filter possible tags with Yaki itself
+ Added README.md content
+ Added an Inspector
+ Fixed some Bugs
Francesco Möller committed Mar 19, 2015
1 parent 284fd0b commit e295990
Showing 6 changed files with 175 additions and 53 deletions.
4 changes: 2 additions & 2 deletions .versions
@@ -8,12 +8,12 @@ ejson@1.0.6
geojson-utils@1.0.3
id-map@1.0.3
json@1.0.3
local-test:nefiltari:yaki@0.0.6
local-test:nefiltari:yaki@0.0.7
logging@1.0.7
meteor@1.1.5
minimongo@1.0.7
mongo@1.1.0
nefiltari:yaki@0.0.6
nefiltari:yaki@0.0.7
ordered-dict@1.0.3
random@1.0.3
retry@1.0.3
52 changes: 50 additions & 2 deletions README.md
@@ -1,8 +1,56 @@
# Yaki
Yaki can capture relevant tags from any bunch of text. Works on the client and on the server.

Features of Yaki:
- Uses term normalization to construct a list of terms
- Uses stopword lists and a language dependent alphabet as dictionaries
- Calculates tag relevance via statistical methods like entropy and the standard normal distribution
- Uses [n-grams](http://en.wikipedia.org/wiki/N-gram) for stemming and similarity detection
- Can find word combinations (in case of multiple occurrences)
- Currently supported languages: English and German
- Uses language dependent feature configurations to improve QoS

Text Retrieval classification: *morphology* and parts of *syntax* (without vocabulary)

***Beware***: This is an early alpha test release and NOT suitable for production.

### Installation

```shell
$ meteor add nefiltari:yaki
```

### How-To
For simple tagging (most features are activated by default) use the following syntax:
```coffee
console.log Yaki("This is a sample text to demonstrate the tagging engine `Yaki`.").extract()
# -> [ 'demonstrate', 'yaki', 'engine', 'tagging' ]
```

If you know the language of the text, you can specify it in the second parameter (use the top-level domain abbreviation).
The default language is English.
You can also pass already known tags to give those words a stronger weight.
```coffee
text = "Dieser Beispieltext demonstriert das Tagging in deutscher Sprache."
console.log Yaki(text, {language: 'de', tags: ['yaki']}).extract()
# -> [ 'yaki', 'demonstriert', 'beispieltext', 'deutscher', 'sprache' ]
```

You can normalize and `clean()` an array of words, fragments or tags with Yaki.
```coffee
fragments = ['(legend)', 'advanced.', 'MultiColor', '-> HTTP <-']
console.log Yaki(fragments).clean()
# -> [ 'legend', 'advanced', 'multicolor', 'http' ]
```

### ToDo

- [ ] Instead of transferring the heavy stopword-lists to the client, proxy client requests through
a server method
- [x] Improve the algorithm to find multi-word phrases instead of just single words
- [x] Refactor the source code to improve readability and performance even further
- [ ] Add more test cases to ensure quality and enable better collaboration

### License

This code is licensed under the LGPL 3.0. Do whatever you want with this code, but I'd like to get improvements and bugfixes back.
44 changes: 44 additions & 0 deletions lib/configuration.coffee.md
@@ -0,0 +1,44 @@
# Configuration
This file provides each language's specific tuning for calculating significance.

Configuration = @Configuration =
## English
The English language has shorter and fewer capitalized words. It also assigns a stronger
weight to word entropy because of the (mostly) short and simple word variations.

en:
# Stemming with k-grams (Yaki.stem)
k: 4
similarity: 0.6
# Calculation (Yaki.calculate)
entropieStrength: 3
frequencyCoefficient: 1.0
capitalizeBonus: 10
akkronymBonus: 15
positionCoefficient: 1.0
tagBonus: 20
# Word Combination (Yaki.combine)
combinationOccurences: 2
# Analyse (Yaki.analyse)
minQuality: 3
## German
The German language has very long and more capitalized words. The words need a
softer similarity level because of more word variations (morphology).

de:
# Stemming with k-grams (Yaki.stem)
k: 4
similarity: 0.4
# Calculation (Yaki.calculate)
entropieStrength: 2
frequencyCoefficient: 1.0
capitalizeBonus: 2
akkronymBonus: 15
positionCoefficient: 1.0
tagBonus: 20
# Word Combination (Yaki.combine)
combinationOccurences: 2
# Analyse (Yaki.analyse)
minQuality: 5
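As a sketch of how these per-language blocks are consumed, a lookup with an English fallback (mirroring the `context.language` check in `yaki.coffee.md`) could look like the following in plain JavaScript. `configFor` is an illustrative helper name, not part of Yaki's API:

```javascript
// Abridged copy of the per-language configuration above; only a few
// keys are shown, the real object carries the full set.
const Configuration = {
  en: { k: 4, similarity: 0.6, minQuality: 3 },
  de: { k: 4, similarity: 0.4, minQuality: 5 }
};

// Pick the configuration for a language, falling back to English
// for unsupported languages (hypothetical helper).
function configFor(language) {
  return Configuration[language] || Configuration.en;
}

configFor('de').minQuality; // -> 5
configFor('fr').minQuality; // -> 3 (falls back to 'en')
```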
104 changes: 59 additions & 45 deletions lib/yaki.coffee.md
@@ -4,6 +4,7 @@ The `context` is an optional object that has the following keys:
- `language`: Language abbreviation (TLD-Specification) (Default: 'en')
- `natural`: Use only natural words instead of words like: foo_bar, foo-bar, 1__0 (Default: true)
- `moderated`: Is this a moderated text (true/false) (Default: true)
- `filter`: Enables or disables the high pass filter for found tags (Default: true)
- `tags`: Optionally an array of tags that **can** describe this article (these tags get more weight when found)


@@ -12,18 +12,24 @@ The `context` is an optional object that has the following keys:
@context = context if context
return this
else
# Initialize
dictionary = new Array
dictionary.context = context or {}
lang = dictionary.context.language or 'en'
lang = if _.contains(Vocabulary.support, lang) then lang else 'en'
dictionary.context.language = lang
# Context
context = context or {}
unless _.contains(Yaki.Vocabulary.support, context.language)
context.language = 'en'
context.tags = Yaki(context.tags).clean() if context.tags
dictionary.context = context
# Methods
dictionary.split = Yaki.split
dictionary.clean = Yaki.clean
dictionary.stem = Yaki.stem
dictionary.calculate = Yaki.calculate
dictionary.combine = Yaki.combine
dictionary.rank = Yaki.rank
dictionary.extract = Yaki.extract
dictionary.inspect = Yaki.inspect
# Text
if _.isArray text
dictionary.text = text.join(' ')
dictionary = dictionary.split()
Expand All @@ -44,37 +51,33 @@ Stopword in multiple languages to filter high frequently words.
## Configuration
The algorithms need some metrics and variables to do the right things in an acceptable range.

Yaki.Config =
# Stemming (Yaki.stem)
k: 4
similarity: 0.4
# Calculation (Yaki.calculate)
entropieStrength: 2
frequencyCoefficient: 1.0
positionCoefficient: 1.0
capitalizeBonus: 5
akkronymBonus: 20
tagBonus: 30
# Word Combination (Yaki.combine) & Analyse (Yaki.analyse)
combinationOccurences: 2
minQuality: 5
Yaki.Configuration = Configuration
## Inspector
The inspect routine for node's console.log.

Yaki.inspect = (dictionary) ->
Npm.require('util').inspect _.toArray this or dictionary
## Useful Helpers
These helpers are useful for internal functionality.

### `normalize`
Normalize a word (with special characters) to a term.

normalize = (str, type) ->
normalize = (entry) ->
# Negates dot notations from acronyms: U.S.A. -> USA
# Each logical piece or fragment of a word is marked by an '_', e.g. 9/11 -> 9_11, hallo- -> hallo_
# All underscores at the beginning and end are trimmed, e.g. _hallo_ -> hallo
# Each normalized word (term) is converted into lower case e.g. USA -> usa,
str = str.replace(/\./g, '') if type is 'akkr'
str = str.replace(/[\/\\\.\-\#\+\*\:\,\?\'\"\`\´\=\&\%\$\§\!\(\)\]\[\<\>\;\^\°]/g, '_')
# Each normalized word (term) is converted into lower case e.g. USA -> usa,
str = entry.term
str = str.replace(/\./g, '') if entry.type is 'akro'
str = str.replace(/\'/g, '')
str = str.replace(/[\/\\\.\-\#\+\*\:\,\?\"\`\´\=\&\%\$\§\!\(\)\]\[\<\>\;\^\°]/g, '_')
str = str.replace(/^\_*/, '')
str = str.replace(/\_*$/, '')
str.toLowerCase()
entry.term = str.toLowerCase()
entry
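The same normalization steps, restated as a self-contained sketch in plain JavaScript (the package itself is literate CoffeeScript; this only illustrates the comments above):

```javascript
// Normalize an entry's term: drop dots from acronyms, strip apostrophes,
// replace special characters with '_', trim leading/trailing underscores,
// and lowercase the result.
function normalize(entry) {
  let str = entry.term;
  if (entry.type === 'akro') str = str.replace(/\./g, '');  // U.S.A. -> USA
  str = str.replace(/'/g, '');                              // don't -> dont
  str = str.replace(/[\/\\.\-#+*:,?"`´=&%$§!()\]\[<>;^°]/g, '_'); // 9/11 -> 9_11
  str = str.replace(/^_*/, '').replace(/_*$/, '');          // _hallo_ -> hallo
  entry.term = str.toLowerCase();                           // USA -> usa
  return entry;
}

normalize({term: 'U.S.A.', type: 'akro'}); // -> {term: 'usa', type: 'akro'}
```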
### `toKGram`
Convert a term to a k-gram. For better index construction an optional callback can call each k-gram piece. Minimum for k is 2.
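A minimal JavaScript sketch of such a k-gram walk; the real `toKGram` may pad or index terms differently, so treat the details here as assumptions:

```javascript
// Slide a window of length k over the term, collecting each k-gram
// and invoking an optional callback per piece.
function toKGram(term, k, callback) {
  if (k < 2) k = 2;                      // minimum for k is 2
  const grams = [];
  for (let i = 0; i + k <= term.length; i++) {
    const gram = term.slice(i, i + k);   // window starting at position i
    grams.push(gram);
    if (callback) callback(gram);
  }
  return grams;
}

toKGram('tagging', 4); // -> ['tagg', 'aggi', 'ggin', 'ging']
```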
@@ -127,20 +130,22 @@ Clean the result. Define a term type and normalize each word. Filter the list wi
return dictionary unless dictionary.terms
lang = dictionary.context.language
# Define language dependent regexes
vocabular = Vocabulary[lang]
upper = Vocabulary[lang].uppercase
lower = Vocabulary[lang].lowercase
regex = [
new RegExp "^[#{vocabular.uppercase}#{vocabular.lowercase}]\\."
new RegExp "^[#{vocabular.uppercase}]{2,}"
new RegExp "^[#{vocabular.uppercase}]"
new RegExp "^[^#{upper}#{lower}]*[#{lower}#{upper}]\\."
new RegExp "^[^#{upper}#{lower}]*[#{upper}]{2,}"
new RegExp "^[^#{upper}#{lower}]*[#{upper}]"
]
# Determine type and normalize each term
for entry, id in dictionary.terms
last = null
dictionary.terms = dictionary.terms.map (entry) ->
entry.type = switch
when regex[0].test(entry.term) then 'akkr' # matches U.S.A, u.s.a.
when regex[1].test(entry.term) then 'akkr' # matches USA, HTTP but not A (single letter)
when regex[0].test(entry.term) then 'akro' # matches U.S.A, u.s.a.
when regex[1].test(entry.term) then 'akro' # matches USA, HTTP but not A (single letter)
when regex[2].test(entry.term) then 'capi' # matches capitalized words
else 'norm'
entry.term = normalize(entry.term, entry.type)
last = normalize entry
# Count words (before any filter steps in)
dictionary.words = dictionary.terms.length
# Filter blank terms
@@ -152,7 +157,7 @@ Clean the result. Define a term type and normalize each word. Filter the list wi
new RegExp(/\_/).test entry.term
# Filter with Stopwords
dictionary.terms = _.reject dictionary.terms, (entry) ->
_.contains Yaki.Stopwords[lang], entry.term
entry.type isnt 'akro' and _.contains Yaki.Stopwords[lang], entry.term
# Recalculate Id's and link to result
dictionary.length = 0
for entry, id in dictionary.terms
@@ -168,13 +173,15 @@ Convert each term into a token. Each token has multiple occurrences in the text. That
return dictionary unless dictionary.terms
dictionary.index = {}
dictionary.similarities = []
lang = dictionary.context.language
config = Yaki.Configuration[lang]
for entry, id in dictionary.terms
# Initialize some Variables
candidates = {}
count = 0
max = 0
# Process the K-Gram and gather data about possible similarities
toKGram entry.term, Yaki.Config.k, (gram) ->
toKGram entry.term, config.k, (gram) ->
# Insert gram into index with term id and
# Fill the similarity vector with possible similarities
# The variable max marks the most similar term found
@@ -194,7 +201,7 @@ Convert each term into a token. Each token has multiple occurrences in the text. That
for candidate, intersect of candidates when intersect is max
# Dice: ( 2 * |Intersect(a,b)| ) / ( |a| + |b| )
distance = (2 * intersect) / (dictionary.terms[candidate*1].kGramCount + count)
if distance > Yaki.Config.similarity and distance > similarity
if distance > config.similarity and distance > similarity
similarity = distance
best = candidate*1
# Similar Term found (best): Register in dictionary.similarities (counter array)
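The Dice coefficient from the comment above can be sketched over two k-gram sets like this (plain JavaScript; the helper names are illustrative, and the actual code works over a shared index rather than rebuilding the sets per pair):

```javascript
// Collect the set of k-grams of a term.
function kGrams(term, k) {
  const grams = new Set();
  for (let i = 0; i + k <= term.length; i++) grams.add(term.slice(i, i + k));
  return grams;
}

// Dice: ( 2 * |Intersect(a,b)| ) / ( |a| + |b| ), over k-gram sets.
function dice(a, b, k) {
  const ga = kGrams(a, k), gb = kGrams(b, k);
  let intersect = 0;
  for (const g of ga) if (gb.has(g)) intersect++;
  return (2 * intersect) / (ga.size + gb.size);
}

dice('tagging', 'taggings', 4); // -> 8/9 ≈ 0.889, above the English threshold of 0.6
```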
@@ -219,28 +226,30 @@ Calculates each token's entropy with the language vocabulary and token frequency. Add b
dictionary = Yaki.call this, dictionary, context
return dictionary unless dictionary.terms
lang = dictionary.context.language
config = Yaki.Configuration[lang]
for entry, id in dictionary.terms when not entry.quality?
# Step 1: Basic Entropy (included word length and relative term frequency)
quality = entropy entry.term, Yaki.Vocabulary[lang].frequencies
quality = Math.round quality
# Strengthen the entropie
quality = Math.pow quality, Yaki.Config.entropieStrength
quality = Math.pow quality, config.entropieStrength
# Step 2: Term frequency: Luhn's law (Config: Coefficient)
frequency = if entry.similar? then dictionary.similarities[entry.similar].length else 1
quality = quality * (Yaki.Config.frequencyCoefficient * frequency)
quality = quality * (config.frequencyCoefficient * frequency)
# Step 3: Capitalized word bonus and acronym bonus (Config: Bonus)
quality = quality + Yaki.Config.capitalizeBonus if entry.type is 'capi'
quality = quality + Yaki.Config.akkronymBonus if entry.type is 'akkr'
quality = quality + config.capitalizeBonus if entry.type is 'capi'
quality = quality + config.akkronymBonus if entry.type is 'akro'
# Step 4: Term Position (Config: Coefficient) (isnt false for '===')
#if dictionary.context.moderated isnt false
# Standard normal distribution (Normalized for x and y)
# Construct the normalized value from id (currently disabled)
#x = -4 + (id / dictionary.terms.length) * 8
#weight = 1 - ((1 / Math.sqrt(2 * Math.PI)) * Math.exp(-0.5*x*x))
#quality = quality * Yaki.Config.positionCoefficient * weight
#quality = quality * config.positionCoefficient * weight
# Step 5: Known Context Tags (Config: Bonus)
if dictionary.context.tags?
if _.contains dictionary.context.tags, entry.term
quality = quality + Yaki.Config.tagBonus
quality = quality + config.tagBonus
dictionary.terms[id].quality = Math.round quality
return dictionary
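The steps above can be condensed into a sketch. The real `entropy` function is not shown in this diff, so a simple average-surprisal estimate over assumed letter frequencies stands in for it; everything except the configuration values is illustrative:

```javascript
// English configuration values from lib/configuration.coffee.md.
const config = { entropieStrength: 3, frequencyCoefficient: 1.0,
                 capitalizeBonus: 10, akkronymBonus: 15, tagBonus: 20 };

// Assumed stand-in for Yaki.Vocabulary[lang].frequencies (abridged).
const frequencies = { e: 0.127, t: 0.091, a: 0.082 };

function quality(entry, similarCount, contextTags) {
  // Step 1: basic entropy (stand-in: average surprisal per letter,
  // unknown letters get a small default frequency)
  let q = 0;
  for (const ch of entry.term) q += -Math.log2(frequencies[ch] || 0.01);
  q = Math.round(q / entry.term.length);
  q = Math.pow(q, config.entropieStrength);          // strengthen the entropy
  // Step 2: term frequency (Luhn's law)
  q = q * (config.frequencyCoefficient * (similarCount || 1));
  // Step 3: capitalization / acronym bonus
  if (entry.type === 'capi') q += config.capitalizeBonus;
  if (entry.type === 'akro') q += config.akkronymBonus;
  // Step 4 (position weight) is disabled in this version
  // Step 5: known context tags
  if (contextTags && contextTags.includes(entry.term)) q += config.tagBonus;
  return Math.round(q);
}
```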
@@ -250,18 +259,20 @@ Find word combinations and semantic rules between words/terms.
Yaki.combine = (dictionary, context) ->
dictionary = Yaki.call this, dictionary, context
return dictionary unless dictionary.similarities
lang = dictionary.context.language
config = Yaki.Configuration[lang]
for similarity, sid in dictionary.similarities
combo = {}
best = -1
quality = 0
for tid in similarity when dictionary.terms[tid].quality >= Yaki.Config.minQuality
for tid in similarity
current = dictionary.terms[tid]
next = dictionary.terms[tid+1]
if next? and next.similar? and next.quality >= Yaki.Config.minQuality
if next? and next.similar?
if (current.position+1) is next.position
# Gather different similar classes that direct follow a term
combo[next.similar] = (combo[next.similar] or 0) + 1
if combo[next.similar] >= Yaki.Config.combinationOccurences # high pass
if combo[next.similar] >= config.combinationOccurences
if (combo[next.similar] * next.quality) > quality
best = next.similar
quality = combo[next.similar] * next.quality
@@ -290,11 +301,14 @@ This function is a full standard process for text mining and analysing and comb
Yaki.extract = (dictionary, context) ->
dictionary = Yaki.call this, dictionary, context
return dictionary unless dictionary.text
lang = dictionary.context.language
config = Yaki.Configuration[lang]
dictionary = dictionary.split().clean().stem().calculate().combine().rank()
result = dictionary.ranking
# Step 1: Filter the ranking (high pass) by a minimum quality
result = _.filter result, (entry) ->
entry.quality > Yaki.Config.minQuality
if dictionary.context.filter isnt false
result = _.filter result, (entry) ->
entry.quality >= config.minQuality
# Step 2: Filter terms that have the same similarity class (keep the best similar term)
similarities = []
result = _.filter result, (entry) ->
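Steps 1 and 2 of `Yaki.extract` can be sketched as two array filters, assuming each ranking entry carries a `quality` score and an optional `similar` class id (a plain-JavaScript illustration, not the package's code):

```javascript
// High-pass filter the ranking by minQuality, then keep only the first
// (best-ranked) entry of each similarity class.
function highPass(ranking, minQuality, filter = true) {
  let result = ranking;
  // Step 1: drop entries below the minimum quality (skipped if filter is off)
  if (filter !== false) result = result.filter(e => e.quality >= minQuality);
  // Step 2: remember seen similarity classes; only the first entry survives
  const seen = new Set();
  return result.filter(e => {
    if (e.similar == null) return true;   // no similarity class: keep
    if (seen.has(e.similar)) return false;
    seen.add(e.similar);
    return true;
  });
}

highPass([{term: 'a', quality: 9, similar: 0},
          {term: 'b', quality: 8, similar: 0},
          {term: 'c', quality: 2}], 3);   // -> only the 'a' entry remains
```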
3 changes: 2 additions & 1 deletion package.js
@@ -1,6 +1,6 @@
Package.describe({
name: 'nefiltari:yaki',
version: '0.0.7',
version: '0.0.8',
summary: 'Yaki can capture relevant tags from any bunch of text.',
git: 'https://github.com/nefiltari/yaki.git',
documentation: 'README.md'
@@ -13,6 +13,7 @@ Package.onUse(function(api) {
'stopwords/stopwords_de.coffee',
'stopwords/stopwords_en.coffee',
'lib/vocabulary.coffee.md',
'lib/configuration.coffee.md',
'lib/yaki.coffee.md',
'globals.js'
],['client','server']);