Skip to content

Commit

Permalink
#51
Browse files Browse the repository at this point in the history
- text features dict
- feat extraction
  • Loading branch information
diegoesteves committed Apr 30, 2020
1 parent e0fe1dd commit c5fd413
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 29 deletions.
2 changes: 1 addition & 1 deletion scripts/04_setup_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __download_image_local(image_url, image_type, thumbs_url, thumbs_type, term_
def cache_images_and_news(horus: Horus):
try:
with SQLiteHelper(config.database_db) as sqlcon:
config.logger.info('caching results...')
config.logger.info('Im on it! keep calm and take a coffee! :)')
horus_db = HorusDB(sqlcon)
auxc = 1
download = False
Expand Down
66 changes: 39 additions & 27 deletions scripts/05_feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,21 @@


def _append_word_lemma_stem(w, l, s):
t=[]
try: t.append(enc_word.transform(str(w)))
t = []
try:
t.append(enc_word.transform(str(w)))
except:
config.logger.warn('enc_word.transform error')
t.append(0)

try: t.append(enc_lemma.transform(l.decode('utf-8')))
try:
t.append(enc_lemma.transform(l.decode('utf-8')))
except:
config.logger.warn('enc_lemma.transform error')
t.append(0)

try: t.append(enc_stem.transform(s.decode('utf-8')))
try:
t.append(enc_stem.transform(s.decode('utf-8')))
except:
config.logger.warn('enc_stem.transform error')
t.append(0)
Expand All @@ -30,35 +33,34 @@ def _append_word_lemma_stem(w, l, s):


def _shape(word):
word_shape = 0 #'other'
word_shape = 0 # 'other'
if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
word_shape = 1 #'number'
word_shape = 1 # 'number'
elif re.match('\W+$', word):
word_shape = 2 #'punct'
word_shape = 2 # 'punct'
elif re.match('[A-Z][a-z]+$', word):
word_shape = 3 #'capitalized'
word_shape = 3 # 'capitalized'
elif re.match('[A-Z]+$', word):
word_shape = 4 # 'uppercase'
word_shape = 4 # 'uppercase'
elif re.match('[a-z]+$', word):
word_shape = 5 #'lowercase'
word_shape = 5 # 'lowercase'
elif re.match('[A-Z][a-z]+[A-Z][a-z]+[A-Za-z]*$', word):
word_shape = 6 #'camelcase'
word_shape = 6 # 'camelcase'
elif re.match('[A-Za-z]+$', word):
word_shape = 7 #'mixedcase'
word_shape = 7 # 'mixedcase'
elif re.match('__.+__$', word):
word_shape = 8 # 'wildcard'
word_shape = 8 # 'wildcard'
elif re.match('[A-Za-z0-9]+\.$', word):
word_shape = 9 # 'ending-dot'
word_shape = 9 # 'ending-dot'
elif re.match('[A-Za-z0-9]+\.[A-Za-z0-9\.]+\.$', word):
word_shape = 10 # 'abbreviation'
word_shape = 10 # 'abbreviation'
elif re.match('[A-Za-z0-9]+\-[A-Za-z0-9\-]+.*$', word):
word_shape = 11 #'contains-hyphen'
word_shape = 11 # 'contains-hyphen'

return word_shape


def _extract_lexical(horus: Horus) -> Horus:

try:
lx_dict = WordFeaturesInterface.get_lexical()
tot_slide_brown_cluster = 5
Expand All @@ -70,10 +72,13 @@ def _extract_lexical(horus: Horus) -> Horus:
brown_640_path = '{:<016}'.format(dict_brown_c640.get(token.text, '0000000000000000'))
brown_320_path = '{:<016}'.format(dict_brown_c320.get(token.text, '0000000000000000'))

for i in range(0, tot_slide_brown_cluster-1):
token.features.lexical.values[lx_dict_reversed.get('brown_1000.' + str(i+1))] = brown_1000_path[:i+1]
token.features.lexical.values[lx_dict_reversed.get('brown_640.' + str(i+1))] = brown_640_path[:i+1]
token.features.lexical.values[lx_dict_reversed.get('brown_320.' + str(i+1))] = brown_320_path[:i+1]
for i in range(0, tot_slide_brown_cluster - 1):
token.features.lexical.values[lx_dict_reversed.get('brown_1000.' + str(i + 1))] = brown_1000_path[
:i + 1]
token.features.lexical.values[lx_dict_reversed.get('brown_640.' + str(i + 1))] = brown_640_path[
:i + 1]
token.features.lexical.values[lx_dict_reversed.get('brown_320.' + str(i + 1))] = brown_320_path[
:i + 1]

token.features.lexical.values[lx_dict_reversed.get('word.lower')] = token.text.lower()

Expand All @@ -92,7 +97,8 @@ def _extract_lexical(horus: Horus) -> Horus:
token.features.lexical.values[lx_dict_reversed.get('word.lemma')] = lemma
token.features.lexical.values[lx_dict_reversed.get('word.stem')] = stem
token.features.lexical.values[lx_dict_reversed.get('word.len.1')] = int(len(token.text) == 1)
token.features.lexical.values[lx_dict_reversed.get('word.has.special')] = int(len(re.findall('(http://\S+|\S*[^\w\s]\S*)', token.text)) > 0)
token.features.lexical.values[lx_dict_reversed.get('word.has.special')] = int(
len(re.findall('(http://\S+|\S*[^\w\s]\S*)', token.text)) > 0)
token.features.lexical.values[lx_dict_reversed.get('word[0].isupper')] = int(token.text[0].isupper())
token.features.lexical.values[lx_dict_reversed.get('word.isupper')] = int(token.text.isupper())
token.features.lexical.values[lx_dict_reversed.get('word.istitle')] = int(token.text.istitle())
Expand Down Expand Up @@ -143,14 +149,19 @@ def extract_features(horus: Horus, lexical: bool = False, text: bool = False, im
return True

except Exception as e:
config.logger.error(str(e))
return e
config.logger.exception(str(e))
return False


if __name__ == '__main__':

config = HorusConfig()

# define the feature sets you want to extract
EXTRACT_LEXICAL = True
EXTRACT_TEXT = False
EXTRACT_IMAGE = False

config.logger.info('loading lemmatizers')
stemmer = SnowballStemmer('english')
stop = set(stopwords.words('english'))
Expand Down Expand Up @@ -179,13 +190,14 @@ def extract_features(horus: Horus, lexical: bool = False, text: bool = False, im
try:
conll_file = ds[1] + ds[2]
assert '.horusx' in conll_file
horus_file_stage2 = conll_file.replace('.horusx', '.horus1.json')
horus_file_stage2 = conll_file.replace('.horusx', '.horus2.json')

config.logger.info('loading horus file: ' + horus_file_stage2)
horus = HorusDataLoader.load_metadata_from_file(file=horus_file_stage2)

config.logger.info('feature extraction')
ok = extract_features(horus, lexical=True)
config.logger.info(f'feature extraction: '
f'lexical: {EXTRACT_LEXICAL}, text: {EXTRACT_TEXT}, image: {EXTRACT_IMAGE}')
ok = extract_features(horus, lexical=EXTRACT_LEXICAL, text=EXTRACT_TEXT, image=EXTRACT_IMAGE)
if not ok:
config.logger.warn('feature extraction: something went wrong...')

Expand Down
53 changes: 52 additions & 1 deletion src/horus_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,58 @@ def get_visual() -> dict:
@staticmethod
def get_textual() -> dict:
    """Return the mapping from textual-feature index to feature name.

    Keys are the contiguous integers 0..50; values are the dotted feature
    names. The former placeholder entry ``0: 'blah'`` is dropped: it shared
    key 0 with the real entry below, so it was always overwritten.

    :return: a new dict ``{index: feature_name}`` on every call.
    """
    return {
        0: 'total.global.results.search_engine',
        1: 'total.retrieved.results.search_engine',
        2: 'total.error.translation',
        3: 'total.binary.k.loc',
        4: 'total.binary.k.org',
        5: 'total.binary.k.per',
        6: 'total.binary.k.other',
        7: 'top.binary.k',
        8: 'dist.k',
        9: 'total.topic.k.loc',
        10: 'total.topic.k.org',
        11: 'total.topic.k.per',
        12: 'total.topic.k.other',
        13: 'top.topic.k',
        14: 'dist.k.topic_model',
        15: 'total.emb.similar.loc',
        16: 'total.emb.similar.org',
        17: 'total.emb.similar.per',
        18: 'total.emb.similar.other',
        19: 'stats.topic.top.k.sum.loc',
        20: 'stats.topic.top.k.sum.org',
        21: 'stats.topic.top.k.sum.per',
        22: 'stats.topic.top.k.sum.other',
        23: 'stats.topic.top.k.avg.loc',
        24: 'stats.topic.top.k.avg.org',
        25: 'stats.topic.top.k.avg.per',
        26: 'stats.topic.top.k.avg.other',
        27: 'stats.topic.top.k.max.loc',
        28: 'stats.topic.top.k.max.org',
        29: 'stats.topic.top.k.max.per',
        30: 'stats.topic.top.k.max.other',
        31: 'stats.topic.top.k.min.loc',
        32: 'stats.topic.top.k.min.org',
        33: 'stats.topic.top.k.min.per',
        34: 'stats.topic.top.k.min.other',
        35: 'stats.topic.sum.loc',
        36: 'stats.topic.sum.org',
        37: 'stats.topic.sum.per',
        38: 'stats.topic.sum.other',
        39: 'stats.topic.avg.loc',
        40: 'stats.topic.avg.org',
        41: 'stats.topic.avg.per',
        42: 'stats.topic.avg.other',
        43: 'stats.topic.max.loc',
        44: 'stats.topic.max.org',
        45: 'stats.topic.max.per',
        46: 'stats.topic.max.other',
        47: 'stats.topic.min.loc',
        48: 'stats.topic.min.org',
        49: 'stats.topic.min.per',
        50: 'stats.topic.min.other',
    }

@staticmethod
Expand Down

0 comments on commit c5fd413

Please sign in to comment.