-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathword_importance.py
78 lines (64 loc) · 2.59 KB
/
word_importance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#-----------------------------
#
# This script runs a Random Forest
# on the recipe data by WORDS (not ingredients),
# and then spits out the feature importance
#
#-----------------------------
# Import libraries
import numpy as np
import random
import logging
import json
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from scipy import sparse
import matplotlib.pyplot as plt
# Set Working Directory
os.chdir('/home/nick/Documents/Kaggle/Recipes/')
##----Format Logging-----
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)
##----Load Data-----
with open('train.json') as data_file:
train = json.load(data_file)
##----ETL----
# Create joined recipe string
# Create documents
for entry in train:
entry['ingredient_string'] = ' '.join(entry['ingredients'])
#-----Create list of ingredients-----
ingredient_list = [x['ingredient_string'] for x in train]
target = [x['cuisine'] for x in train]
#---Split Train set into train-train and train-test---
# Not sure we need the following because we are doing a RF
# on the whole training set. But we can always split as follows...
indices = range(len(ingredient_list))
percent = 0.8
random.shuffle(indices)
how_many = int(np.round(percent * len(train)))
train_train_indices = indices[0:how_many]
train_test_indices = indices[how_many:]
train_train = [x for ix, x in enumerate(ingredient_list) if ix in train_train_indices]
train_test = [x for ix, x in enumerate(ingredient_list) if ix in train_test_indices]
train_train_target = [x for ix, x in enumerate(target) if ix in train_train_indices]
train_test_target = [x for ix, x in enumerate(target) if ix in train_test_indices]
#----Bag of words transform-----
vectorizer = CountVectorizer(analyzer = 'word',
tokenizer = None,
preprocessor = None,
stop_words = None,
max_features = None)
# trained_features = vectorizer.fit_transform(train_train)
trained_features = vectorizer.fit_transform(ingredient_list)
#-----Train Random Forest-----
recipe_forest = RandomForestClassifier(n_estimators = 100)
recipe_forest = recipe_forest.fit(trained_features, target)
#----Feature Importance-----
importances = recipe_forest.feature_importances_
ind_sort = np.argsort(importances)[::-1]
importances = importances[ind_sort]
ingredient_names = vectorizer.get_feature_names()
ingredient_names = [ingredient_names[x] for x in ind_sort]
# Plot feature importances
plt.bar(range(len(importances)), importances)