Skip to content

Commit

Permalink
#51
Browse files Browse the repository at this point in the history
- text features dict
- feat extraction
- bash update (spacy models)
  • Loading branch information
diegoesteves committed Jun 10, 2020
1 parent 2156149 commit 6368989
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 91 deletions.
3 changes: 3 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,13 @@ def __init__(self):
self.models_cv_org_dict = self.dir_models + parser.get('models-cv', 'horus_org_voc')
self.models_cv_per = self.dir_models + parser.get('models-cv', 'horus_per')

self.categories_encoder = self.dir_models + parser.get('models-text', 'horus_textchecking_enc')
self.models_tfidf = self.dir_models + parser.get('models-text', 'horus_textchecking_tfidf')
self.models_0_text = self.dir_models + parser.get('models-text', 'horus_textchecking_0')
self.models_1_text = self.dir_models + parser.get('models-text', 'horus_textchecking_1')
self.models_2_text = self.dir_models + parser.get('models-text', 'horus_textchecking_2')


self.models_1_text_cnn = self.dir_models + parser.get('models-text', 'horus_texthecking_tm_cnn')

self.model_final = self.dir_models + parser.get('models-horus', 'horus_final')
Expand Down
80 changes: 56 additions & 24 deletions notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
},
{
"cell_type": "code",
"execution_count": 244,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -62,7 +62,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -78,7 +78,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -155,7 +155,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -232,7 +232,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -241,7 +241,7 @@
"\"\\ndf_other2 = pd.read_csv('./data/raw/dump_dbpedia_other_02.csv', sep='\\t', index_col=0)\\ndf_other3 = pd.read_csv('./data/raw/dump_dbpedia_other_03.csv', sep='\\t', index_col=0)\\ndf_other4 = pd.read_csv('./data/raw/dump_dbpedia_other_04.csv', sep='\\t', index_col=0)\\ndf_other5 = pd.read_csv('./data/raw/dump_dbpedia_other_05.csv', sep='\\t', index_col=0)\\ndf_other6 = pd.read_csv('./data/raw/dump_dbpedia_other_06.csv', sep='\\t', index_col=0)\\ndf_other7 = pd.read_csv('./data/raw/dump_dbpedia_other_07.csv', sep='\\t', index_col=0)\\ndf_other8 = pd.read_csv('./data/raw/dump_dbpedia_other_08.csv', sep='\\t', index_col=0)\\ndf_other9 = pd.read_csv('./data/raw/dump_dbpedia_other_09.csv', sep='\\t', index_col=0)\\ndf_other10 = pd.read_csv('./data/raw/dump_dbpedia_other_10.csv', sep='\\t', index_col=0)\\n\""
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -305,7 +305,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -398,7 +398,7 @@
"4 Nikos Ventouras (August 31, 1899 – April 1, 19... "
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -416,7 +416,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -470,7 +470,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -483,7 +483,7 @@
},
{
"cell_type": "code",
"execution_count": 251,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand All @@ -497,7 +497,7 @@
"Name: s, dtype: int64"
]
},
"execution_count": 251,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -511,7 +511,7 @@
},
{
"cell_type": "code",
"execution_count": 252,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -524,7 +524,7 @@
},
{
"cell_type": "code",
"execution_count": 253,
"execution_count": 12,
"metadata": {},
"outputs": [
{
Expand All @@ -533,7 +533,7 @@
"Index(['PER', 'ORG', 'LOC', 'OTHER'], dtype='object')"
]
},
"execution_count": 253,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -544,7 +544,7 @@
},
{
"cell_type": "code",
"execution_count": 255,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -553,7 +553,7 @@
},
{
"cell_type": "code",
"execution_count": 256,
"execution_count": 14,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -614,7 +614,7 @@
"52292 OTHER 3"
]
},
"execution_count": 256,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -625,7 +625,7 @@
},
{
"cell_type": "code",
"execution_count": 257,
"execution_count": 15,
"metadata": {},
"outputs": [
{
Expand All @@ -634,7 +634,7 @@
"['encoder_4MUC_cat2id_id2cat.joblib']"
]
},
"execution_count": 257,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -647,7 +647,7 @@
},
{
"cell_type": "code",
"execution_count": 273,
"execution_count": 16,
"metadata": {},
"outputs": [
{
Expand All @@ -656,7 +656,7 @@
"{'PER': 0, 'ORG': 1, 'LOC': 2, 'OTHER': 3}"
]
},
"execution_count": 273,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -667,7 +667,7 @@
},
{
"cell_type": "code",
"execution_count": 274,
"execution_count": 17,
"metadata": {},
"outputs": [
{
Expand All @@ -676,7 +676,7 @@
"{0: 'PER', 1: 'ORG', 2: 'LOC', 3: 'OTHER'}"
]
},
"execution_count": 274,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -685,6 +685,26 @@
"idx2category"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'PER'"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"idx2category[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -1554,6 +1574,18 @@
"display_name": "horus",
"language": "python",
"name": "horus"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 6368989

Please sign in to comment.