-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredict_list_factors.py
83 lines (63 loc) · 2.76 KB
/
predict_list_factors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from sklearn import cross_validation
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RandomizedLogisticRegression
import numpy as np
import pandas as pd
import sys
class PredictListFactors(object):
    """Predict a discrete value and rank the factors behind each prediction.

    Rows whose ``testsplitcol`` equals ``trainvalue`` are used to fit the
    model; every other row is scored by ``predictfactors``.

    Parameters
    ----------
    df : pandas dataframe
    predictedcol : name of the y column (discrete) whose values are being
        predicted
    idcol : name of the row-identifier column; it is excluded from the
        features and only used when displaying results
    testsplitcol : name of the column that separates training rows from
        test rows
    trainvalue : value of ``testsplitcol`` that marks a training row
        (defaults to 5, the value that used to be hard-coded)

    Attributes
    ----------
    df : input frame without object-dtype columns (``idcol`` and
        ``predictedcol`` are always retained)
    X_train, y_train : features and target of the training rows
    X_test : features of the rows to be scored
    test_ids : ``idcol`` values of the rows in ``X_test``, in the same order
    names : feature column names, in the column order of ``X_train``

    Returns
    -------
    self : object
    """

    def __init__(self, df, predictedcol, idcol, testsplitcol, trainvalue=5):
        self.predictedcol = predictedcol
        self.idcol = idcol

        # Remove columns that are objects (ie not numbers), but always keep
        # the id and predicted columns: they are not used as features, and
        # dropping a string-typed id/target made construction fail.
        # todo: convert objects to discrete numbers
        keep = ((df.dtypes != object).values
                | df.columns.isin([idcol, predictedcol]))
        self.df = df.loc[:, keep]

        # Remove the id column from ML; self.df still carries it for display.
        collist = self.df.columns.tolist()
        collist.remove(idcol)
        features = self.df[collist]

        # Split the dataset by conditional on testsplitcol.
        is_train = features[testsplitcol] == trainvalue
        is_test = features[testsplitcol] != trainvalue

        self.y_train = features[self.predictedcol][is_train]
        # NOTE(review): testsplitcol itself stays among the features (only
        # the id and predicted columns are removed) -- confirm this is wanted.
        features = features.drop(self.predictedcol, axis=1)
        self.X_train = features[is_train]
        self.X_test = features[is_test]

        # Ids of the test rows, aligned row-for-row with X_test so that each
        # prediction can be reported next to the right id.
        self.test_ids = self.df[idcol][is_test]

        # grab column names for ML
        self.names = list(features)

    # todo: use property so parameter accepts only certain imputation
    # strategy and classifier
    def predictfactors(self):
        """Fit the model, then print id, prediction and top three factors.

        A mean-imputer + logistic-regression pipeline is fitted on the
        training rows. For every test row the features are ranked by
        ``coefficient * feature value`` (largest first) and one list
        ``[id, prediction, factor1, factor2, factor3]`` is printed.
        Diagnostic output for the first test row is printed beforehand.

        Nothing is printed when there are no test rows. At least three
        feature columns are required, since three factor names are reported.
        """
        pipeline = Pipeline([
            ("imputer", Imputer(strategy='mean', axis=0)),
            ("logistic", LogisticRegression()),
        ])
        pipeline.fit(self.X_train, self.y_train)

        n_test = len(self.X_test)
        if n_test == 0:
            # Nothing to score; avoids indexing row 0 of an empty frame.
            return

        predict = pipeline.predict(self.X_test)
        # NOTE(review): the ranking multiplies coef_ with a single row, which
        # presumably expects a binary target (one coefficient row) -- confirm.
        coef = pipeline.named_steps['logistic'].coef_

        # Diagnostics for the first test row.
        print(np.array(coef * self.X_test.iloc[[0]]))
        print(self._rank_factors(coef, self.X_test.iloc[[0]]))
        print(self.names)

        firstfactor = []
        secondfactor = []
        thirdfactor = []
        for i in range(n_test):
            ranking = self._rank_factors(coef, self.X_test.iloc[[i]])
            firstfactor.append(self.names[ranking[0]])
            secondfactor.append(self.names[ranking[1]])
            thirdfactor.append(self.names[ranking[2]])

        # Pair each prediction with the id of the test row it was made for
        # (not with the i-th row of the full frame).
        rows = zip(self.test_ids.values, predict,
                   firstfactor, secondfactor, thirdfactor)
        for row_id, predicted, first, second, third in rows:
            print([row_id, predicted, first, second, third])

    @staticmethod
    def _rank_factors(coef, row):
        """Return feature positions ordered by descending coef * value."""
        contributions = np.array(coef * row)
        return np.array((-contributions).argsort().ravel())