From c5fd413b603e74cb57f2f3550726e134974c4588 Mon Sep 17 00:00:00 2001
From: "diego.esteves" <diego.esteves@farfetch.com>
Date: Thu, 30 Apr 2020 11:54:10 +0100
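Subject: [PATCH] #51 - add textual features dict; configurable feature extraction

- src/horus_meta.py: replace the placeholder entry in get_textual() with
  the named textual features (indices 0-50).
- scripts/05_feature_extraction.py: choose the feature sets to extract via
  the EXTRACT_LEXICAL / EXTRACT_TEXT / EXTRACT_IMAGE flags; load the
  '.horus2.json' stage file instead of '.horus1.json'; on failure
  extract_features() now logs the traceback and returns False instead of
  returning the exception object.
- scripts/04_setup_cache.py: reword the 'caching results...' log message.
- Whitespace/formatting clean-ups only in the remaining hunks.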

---
 scripts/04_setup_cache.py        |  2 +-
 scripts/05_feature_extraction.py | 66 +++++++++++++++++++-------------
 src/horus_meta.py                | 53 ++++++++++++++++++++++++-
 3 files changed, 92 insertions(+), 29 deletions(-)

diff --git a/scripts/04_setup_cache.py b/scripts/04_setup_cache.py
index bf2fb58..e35c4d6 100644
--- a/scripts/04_setup_cache.py
+++ b/scripts/04_setup_cache.py
@@ -40,7 +40,7 @@ def __download_image_local(image_url, image_type, thumbs_url, thumbs_type, term_
 def cache_images_and_news(horus: Horus):
     try:
         with SQLiteHelper(config.database_db) as sqlcon:
-            config.logger.info('caching results...')
+            config.logger.info('Im on it! keep calm and take a coffee! :)')
             horus_db = HorusDB(sqlcon)
             auxc = 1
             download = False
diff --git a/scripts/05_feature_extraction.py b/scripts/05_feature_extraction.py
index db6cf25..f29c08b 100644
--- a/scripts/05_feature_extraction.py
+++ b/scripts/05_feature_extraction.py
@@ -10,18 +10,21 @@
 
 
 def _append_word_lemma_stem(w, l, s):
-    t=[]
-    try: t.append(enc_word.transform(str(w)))
+    t = []
+    try:
+        t.append(enc_word.transform(str(w)))
     except:
         config.logger.warn('enc_word.transform error')
         t.append(0)
 
-    try: t.append(enc_lemma.transform(l.decode('utf-8')))
+    try:
+        t.append(enc_lemma.transform(l.decode('utf-8')))
     except:
         config.logger.warn('enc_lemma.transform error')
         t.append(0)
 
-    try: t.append(enc_stem.transform(s.decode('utf-8')))
+    try:
+        t.append(enc_stem.transform(s.decode('utf-8')))
     except:
         config.logger.warn('enc_stem.transform error')
         t.append(0)
@@ -30,35 +33,34 @@ def _append_word_lemma_stem(w, l, s):
 
 
 def _shape(word):
-    word_shape = 0 #'other'
+    word_shape = 0  # 'other'
     if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
-        word_shape = 1 #'number'
+        word_shape = 1  # 'number'
     elif re.match('\W+$', word):
-        word_shape = 2 #'punct'
+        word_shape = 2  # 'punct'
     elif re.match('[A-Z][a-z]+$', word):
-        word_shape = 3 #'capitalized'
+        word_shape = 3  # 'capitalized'
     elif re.match('[A-Z]+$', word):
-        word_shape = 4 # 'uppercase'
+        word_shape = 4  # 'uppercase'
     elif re.match('[a-z]+$', word):
-        word_shape = 5 #'lowercase'
+        word_shape = 5  # 'lowercase'
     elif re.match('[A-Z][a-z]+[A-Z][a-z]+[A-Za-z]*$', word):
-        word_shape = 6 #'camelcase'
+        word_shape = 6  # 'camelcase'
     elif re.match('[A-Za-z]+$', word):
-        word_shape = 7 #'mixedcase'
+        word_shape = 7  # 'mixedcase'
     elif re.match('__.+__$', word):
-        word_shape = 8 # 'wildcard'
+        word_shape = 8  # 'wildcard'
     elif re.match('[A-Za-z0-9]+\.$', word):
-        word_shape = 9 # 'ending-dot'
+        word_shape = 9  # 'ending-dot'
     elif re.match('[A-Za-z0-9]+\.[A-Za-z0-9\.]+\.$', word):
-        word_shape = 10 # 'abbreviation'
+        word_shape = 10  # 'abbreviation'
     elif re.match('[A-Za-z0-9]+\-[A-Za-z0-9\-]+.*$', word):
-        word_shape = 11 #'contains-hyphen'
+        word_shape = 11  # 'contains-hyphen'
 
     return word_shape
 
 
 def _extract_lexical(horus: Horus) -> Horus:
-
     try:
         lx_dict = WordFeaturesInterface.get_lexical()
         tot_slide_brown_cluster = 5
@@ -70,10 +72,13 @@ def _extract_lexical(horus: Horus) -> Horus:
                 brown_640_path = '{:<016}'.format(dict_brown_c640.get(token.text, '0000000000000000'))
                 brown_320_path = '{:<016}'.format(dict_brown_c320.get(token.text, '0000000000000000'))
 
-                for i in range(0, tot_slide_brown_cluster-1):
-                    token.features.lexical.values[lx_dict_reversed.get('brown_1000.' + str(i+1))] = brown_1000_path[:i+1]
-                    token.features.lexical.values[lx_dict_reversed.get('brown_640.' + str(i+1))] = brown_640_path[:i+1]
-                    token.features.lexical.values[lx_dict_reversed.get('brown_320.' + str(i+1))] = brown_320_path[:i+1]
+                for i in range(0, tot_slide_brown_cluster - 1):
+                    token.features.lexical.values[lx_dict_reversed.get('brown_1000.' + str(i + 1))] = brown_1000_path[
+                                                                                                      :i + 1]
+                    token.features.lexical.values[lx_dict_reversed.get('brown_640.' + str(i + 1))] = brown_640_path[
+                                                                                                     :i + 1]
+                    token.features.lexical.values[lx_dict_reversed.get('brown_320.' + str(i + 1))] = brown_320_path[
+                                                                                                     :i + 1]
 
                 token.features.lexical.values[lx_dict_reversed.get('word.lower')] = token.text.lower()
 
@@ -92,7 +97,8 @@ def _extract_lexical(horus: Horus) -> Horus:
                 token.features.lexical.values[lx_dict_reversed.get('word.lemma')] = lemma
                 token.features.lexical.values[lx_dict_reversed.get('word.stem')] = stem
                 token.features.lexical.values[lx_dict_reversed.get('word.len.1')] = int(len(token.text) == 1)
-                token.features.lexical.values[lx_dict_reversed.get('word.has.special')] = int(len(re.findall('(http://\S+|\S*[^\w\s]\S*)', token.text)) > 0)
+                token.features.lexical.values[lx_dict_reversed.get('word.has.special')] = int(
+                    len(re.findall('(http://\S+|\S*[^\w\s]\S*)', token.text)) > 0)
                 token.features.lexical.values[lx_dict_reversed.get('word[0].isupper')] = int(token.text[0].isupper())
                 token.features.lexical.values[lx_dict_reversed.get('word.isupper')] = int(token.text.isupper())
                 token.features.lexical.values[lx_dict_reversed.get('word.istitle')] = int(token.text.istitle())
@@ -143,14 +149,19 @@ def extract_features(horus: Horus, lexical: bool = False, text: bool = False, im
         return True
 
     except Exception as e:
-        config.logger.error(str(e))
-        return e
+        config.logger.exception(str(e))
+        return False
 
 
 if __name__ == '__main__':
 
     config = HorusConfig()
 
+    # define the feature sets you want to extract
+    EXTRACT_LEXICAL = True
+    EXTRACT_TEXT = False
+    EXTRACT_IMAGE = False
+
     config.logger.info('loading lemmatizers')
     stemmer = SnowballStemmer('english')
     stop = set(stopwords.words('english'))
@@ -179,13 +190,14 @@ def extract_features(horus: Horus, lexical: bool = False, text: bool = False, im
         try:
             conll_file = ds[1] + ds[2]
             assert '.horusx' in conll_file
-            horus_file_stage2 = conll_file.replace('.horusx', '.horus1.json')
+            horus_file_stage2 = conll_file.replace('.horusx', '.horus2.json')
 
             config.logger.info('loading horus file: ' + horus_file_stage2)
             horus = HorusDataLoader.load_metadata_from_file(file=horus_file_stage2)
 
-            config.logger.info('feature extraction')
-            ok = extract_features(horus, lexical=True)
+            config.logger.info(f'feature extraction: '
+                               f'lexical: {EXTRACT_LEXICAL}, text: {EXTRACT_TEXT}, image: {EXTRACT_IMAGE}')
+            ok = extract_features(horus, lexical=EXTRACT_LEXICAL, text=EXTRACT_TEXT, image=EXTRACT_IMAGE)
             if not ok:
                 config.logger.warn('feature extraction: something went wrong...')
 
diff --git a/src/horus_meta.py b/src/horus_meta.py
index d4d56e8..1051e23 100644
--- a/src/horus_meta.py
+++ b/src/horus_meta.py
@@ -24,7 +24,58 @@ def get_visual() -> dict:
     @staticmethod
     def get_textual() -> dict:
         return {
-            0: 'blah'
+            0: 'total.global.results.search_engine',
+            1: 'total.retrieved.results.search_engine',
+            2: 'total.error.translation',
+            3: 'total.binary.k.loc',
+            4: 'total.binary.k.org',
+            5: 'total.binary.k.per',
+            6: 'total.binary.k.other',
+            7: 'top.binary.k',
+            8: 'dist.k',
+            9: 'total.topic.k.loc',
+            10: 'total.topic.k.org',
+            11: 'total.topic.k.per',
+            12: 'total.topic.k.other',
+            13: 'top.topic.k',
+            14: 'dist.k.topic_model',
+            15: 'total.emb.similar.loc',
+            16: 'total.emb.similar.org',
+            17: 'total.emb.similar.per',
+            18: 'total.emb.similar.other',
+            19: 'stats.topic.top.k.sum.loc',
+            20: 'stats.topic.top.k.sum.org',
+            21: 'stats.topic.top.k.sum.per',
+            22: 'stats.topic.top.k.sum.other',
+            23: 'stats.topic.top.k.avg.loc',
+            24: 'stats.topic.top.k.avg.org',
+            25: 'stats.topic.top.k.avg.per',
+            26: 'stats.topic.top.k.avg.other',
+            27: 'stats.topic.top.k.max.loc',
+            28: 'stats.topic.top.k.max.org',
+            29: 'stats.topic.top.k.max.per',
+            30: 'stats.topic.top.k.max.other',
+            31: 'stats.topic.top.k.min.loc',
+            32: 'stats.topic.top.k.min.org',
+            33: 'stats.topic.top.k.min.per',
+            34: 'stats.topic.top.k.min.other',
+            35: 'stats.topic.sum.loc',
+            36: 'stats.topic.sum.org',
+            37: 'stats.topic.sum.per',
+            38: 'stats.topic.sum.other',
+            39: 'stats.topic.avg.loc',
+            40: 'stats.topic.avg.org',
+            41: 'stats.topic.avg.per',
+            42: 'stats.topic.avg.other',
+            43: 'stats.topic.max.loc',
+            44: 'stats.topic.max.org',
+            45: 'stats.topic.max.per',
+            46: 'stats.topic.max.other',
+            47: 'stats.topic.min.loc',
+            48: 'stats.topic.min.org',
+            49: 'stats.topic.min.per',
+            50: 'stats.topic.min.other',
+
         }
 
     @staticmethod