From c5fd413b603e74cb57f2f3550726e134974c4588 Mon Sep 17 00:00:00 2001 From: "diego.esteves" Date: Thu, 30 Apr 2020 11:54:10 +0100 Subject: [PATCH] #51 - text features dict - feat extraction --- scripts/04_setup_cache.py | 2 +- scripts/05_feature_extraction.py | 66 +++++++++++++++++++------------- src/horus_meta.py | 53 ++++++++++++++++++++++++- 3 files changed, 92 insertions(+), 29 deletions(-) diff --git a/scripts/04_setup_cache.py b/scripts/04_setup_cache.py index bf2fb58..e35c4d6 100644 --- a/scripts/04_setup_cache.py +++ b/scripts/04_setup_cache.py @@ -40,7 +40,7 @@ def __download_image_local(image_url, image_type, thumbs_url, thumbs_type, term_ def cache_images_and_news(horus: Horus): try: with SQLiteHelper(config.database_db) as sqlcon: - config.logger.info('caching results...') + config.logger.info('Im on it! keep calm and take a coffee! :)') horus_db = HorusDB(sqlcon) auxc = 1 download = False diff --git a/scripts/05_feature_extraction.py b/scripts/05_feature_extraction.py index db6cf25..f29c08b 100644 --- a/scripts/05_feature_extraction.py +++ b/scripts/05_feature_extraction.py @@ -10,18 +10,21 @@ def _append_word_lemma_stem(w, l, s): - t=[] - try: t.append(enc_word.transform(str(w))) + t = [] + try: + t.append(enc_word.transform(str(w))) except: config.logger.warn('enc_word.transform error') t.append(0) - try: t.append(enc_lemma.transform(l.decode('utf-8'))) + try: + t.append(enc_lemma.transform(l.decode('utf-8'))) except: config.logger.warn('enc_lemma.transform error') t.append(0) - try: t.append(enc_stem.transform(s.decode('utf-8'))) + try: + t.append(enc_stem.transform(s.decode('utf-8'))) except: config.logger.warn('enc_stem.transform error') t.append(0) @@ -30,35 +33,34 @@ def _append_word_lemma_stem(w, l, s): def _shape(word): - word_shape = 0 #'other' + word_shape = 0 # 'other' if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word): - word_shape = 1 #'number' + word_shape = 1 # 'number' elif re.match('\W+$', word): - word_shape = 2 #'punct' + word_shape = 2 # 'punct' elif re.match('[A-Z][a-z]+$', word): - word_shape = 3 #'capitalized' + word_shape = 3 # 'capitalized' elif re.match('[A-Z]+$', word): - word_shape = 4 # 'uppercase' + word_shape = 4 # 'uppercase' elif re.match('[a-z]+$', word): - word_shape = 5 #'lowercase' + word_shape = 5 # 'lowercase' elif re.match('[A-Z][a-z]+[A-Z][a-z]+[A-Za-z]*$', word): - word_shape = 6 #'camelcase' + word_shape = 6 # 'camelcase' elif re.match('[A-Za-z]+$', word): - word_shape = 7 #'mixedcase' + word_shape = 7 # 'mixedcase' elif re.match('__.+__$', word): - word_shape = 8 # 'wildcard' + word_shape = 8 # 'wildcard' elif re.match('[A-Za-z0-9]+\.$', word): - word_shape = 9 # 'ending-dot' + word_shape = 9 # 'ending-dot' elif re.match('[A-Za-z0-9]+\.[A-Za-z0-9\.]+\.$', word): - word_shape = 10 # 'abbreviation' + word_shape = 10 # 'abbreviation' elif re.match('[A-Za-z0-9]+\-[A-Za-z0-9\-]+.*$', word): - word_shape = 11 #'contains-hyphen' + word_shape = 11 # 'contains-hyphen' return word_shape def _extract_lexical(horus: Horus) -> Horus: - try: lx_dict = WordFeaturesInterface.get_lexical() tot_slide_brown_cluster = 5 @@ -70,10 +72,13 @@ def _extract_lexical(horus: Horus) -> Horus: brown_640_path = '{:<016}'.format(dict_brown_c640.get(token.text, '0000000000000000')) brown_320_path = '{:<016}'.format(dict_brown_c320.get(token.text, '0000000000000000')) - for i in range(0, tot_slide_brown_cluster-1): - token.features.lexical.values[lx_dict_reversed.get('brown_1000.' + str(i+1))] = brown_1000_path[:i+1] - token.features.lexical.values[lx_dict_reversed.get('brown_640.' + str(i+1))] = brown_640_path[:i+1] - token.features.lexical.values[lx_dict_reversed.get('brown_320.' + str(i+1))] = brown_320_path[:i+1] + for i in range(0, tot_slide_brown_cluster - 1): + token.features.lexical.values[lx_dict_reversed.get('brown_1000.' + str(i + 1))] = brown_1000_path[ + :i + 1] + token.features.lexical.values[lx_dict_reversed.get('brown_640.' + str(i + 1))] = brown_640_path[ + :i + 1] + token.features.lexical.values[lx_dict_reversed.get('brown_320.' + str(i + 1))] = brown_320_path[ + :i + 1] token.features.lexical.values[lx_dict_reversed.get('word.lower')] = token.text.lower() @@ -92,7 +97,8 @@ def _extract_lexical(horus: Horus) -> Horus: token.features.lexical.values[lx_dict_reversed.get('word.lemma')] = lemma token.features.lexical.values[lx_dict_reversed.get('word.stem')] = stem token.features.lexical.values[lx_dict_reversed.get('word.len.1')] = int(len(token.text) == 1) - token.features.lexical.values[lx_dict_reversed.get('word.has.special')] = int(len(re.findall('(http://\S+|\S*[^\w\s]\S*)', token.text)) > 0) + token.features.lexical.values[lx_dict_reversed.get('word.has.special')] = int( + len(re.findall('(http://\S+|\S*[^\w\s]\S*)', token.text)) > 0) token.features.lexical.values[lx_dict_reversed.get('word[0].isupper')] = int(token.text[0].isupper()) token.features.lexical.values[lx_dict_reversed.get('word.isupper')] = int(token.text.isupper()) token.features.lexical.values[lx_dict_reversed.get('word.istitle')] = int(token.text.istitle()) @@ -143,14 +149,19 @@ def extract_features(horus: Horus, lexical: bool = False, text: bool = False, im return True except Exception as e: - config.logger.error(str(e)) - return e + config.logger.exception(str(e)) + return False if __name__ == '__main__': config = HorusConfig() + # define the feature sets you want to extract + EXTRACT_LEXICAL = True + EXTRACT_TEXT = False + EXTRACT_IMAGE = False + config.logger.info('loading lemmatizers') stemmer = SnowballStemmer('english') stop = set(stopwords.words('english')) @@ -179,13 +190,14 @@ def extract_features(horus: Horus, lexical: bool = False, text: bool = False, im try: conll_file = ds[1] + ds[2] assert '.horusx' in conll_file - horus_file_stage2 = conll_file.replace('.horusx', '.horus1.json') + horus_file_stage2 = conll_file.replace('.horusx', '.horus2.json') config.logger.info('loading horus file: ' + horus_file_stage2) horus = HorusDataLoader.load_metadata_from_file(file=horus_file_stage2) - config.logger.info('feature extraction') - ok = extract_features(horus, lexical=True) + config.logger.info(f'feature extraction: ' + f'lexical: {EXTRACT_LEXICAL}, text: {EXTRACT_TEXT}, image: {EXTRACT_IMAGE}') + ok = extract_features(horus, lexical=EXTRACT_LEXICAL, text=EXTRACT_TEXT, image=EXTRACT_IMAGE) if not ok: config.logger.warn('feature extraction: something went wrong...') diff --git a/src/horus_meta.py b/src/horus_meta.py index d4d56e8..1051e23 100644 --- a/src/horus_meta.py +++ b/src/horus_meta.py @@ -24,7 +24,58 @@ def get_visual() -> dict: @staticmethod def get_textual() -> dict: return { - 0: 'blah' + 0: 'total.global.results.search_engine', + 1: 'total.retrieved.results.search_engine', + 2: 'total.error.translation', + 3: 'total.binary.k.loc', + 4: 'total.binary.k.org', + 5: 'total.binary.k.per', + 6: 'total.binary.k.other', + 7: 'top.binary.k', + 8: 'dist.k', + 9: 'total.topic.k.loc', + 10: 'total.topic.k.org', + 11: 'total.topic.k.per', + 12: 'total.topic.k.other', + 13: 'top.topic.k', + 14: 'dist.k.topic_model', + 15: 'total.emb.similar.loc', + 16: 'total.emb.similar.org', + 17: 'total.emb.similar.per', + 18: 'total.emb.similar.other', + 19: 'stats.topic.top.k.sum.loc', + 20: 'stats.topic.top.k.sum.org', + 21: 'stats.topic.top.k.sum.per', + 22: 'stats.topic.top.k.sum.other', + 23: 'stats.topic.top.k.avg.loc', + 24: 'stats.topic.top.k.avg.org', + 25: 'stats.topic.top.k.avg.per', + 26: 'stats.topic.top.k.avg.other', + 27: 'stats.topic.top.k.max.loc', + 28: 'stats.topic.top.k.max.org', + 29: 'stats.topic.top.k.max.per', + 30: 'stats.topic.top.k.max.other', + 31: 'stats.topic.top.k.min.loc', + 32: 'stats.topic.top.k.min.org', + 33: 'stats.topic.top.k.min.per', + 34: 'stats.topic.top.k.min.other', + 35: 'stats.topic.sum.loc', + 36: 'stats.topic.sum.org', + 37: 'stats.topic.sum.per', + 38: 'stats.topic.sum.other', + 39: 'stats.topic.avg.loc', + 40: 'stats.topic.avg.org', + 41: 'stats.topic.avg.per', + 42: 'stats.topic.avg.other', + 43: 'stats.topic.max.loc', + 44: 'stats.topic.max.org', + 45: 'stats.topic.max.per', + 46: 'stats.topic.max.other', + 47: 'stats.topic.min.loc', + 48: 'stats.topic.min.org', + 49: 'stats.topic.min.per', + 50: 'stats.topic.min.other', + } @staticmethod