forked from justmarkham/DAT5
-
Notifications
You must be signed in to change notification settings - Fork 1
/
21_ensembles_example.py
110 lines (81 loc) · 3.44 KB
/
21_ensembles_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
'''
Imports
'''
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
'''
Define a function that takes in a raw CSV file and returns a DataFrame that
includes all created features (and any other modifications). That way, we
can apply the same changes to both train.csv and test.csv.
'''
# Define the function
def make_features(filename):
    '''
    Read a raw posts CSV and return a DataFrame with engineered features.

    The first CSV column becomes the index and the column
    'OwnerUndeletedAnswerCountAtPostTime' is renamed to 'Answers'. The
    following columns are added:

    - TitleLength / BodyLength: character counts of Title / BodyMarkdown
    - NumTags: number of non-null values among the columns Tag1..Tag5
    - TitleLowercase: 1 if the title equals its lowercased form, else 0
    - TitleQuestion / TitleNeed / TitleHelp: 1 if the title contains the
      word (case-insensitive), else 0

    Missing Title / BodyMarkdown values are treated as length 0 and as not
    containing any keyword, instead of raising.

    Parameters
    ----------
    filename : path of the CSV file to read. It must contain the columns
        Title, BodyMarkdown and Tag1 through Tag5.

    Returns
    -------
    pandas.DataFrame with the original columns plus the features above.
    '''
    # Read in dataframe, using the first column as the index
    df = pd.read_csv(filename, index_col=0)

    # Rename columns
    df.rename(columns={'OwnerUndeletedAnswerCountAtPostTime': 'Answers'}, inplace=True)

    # Lengths of title and body; .str.len() yields NaN for missing text
    # (where apply(len) would raise), so map those rows to 0
    df['TitleLength'] = df.Title.str.len().fillna(0).astype(int)
    df['BodyLength'] = df.BodyMarkdown.str.len().fillna(0).astype(int)

    # Number of tags for post
    df['NumTags'] = df.loc[:, 'Tag1':'Tag5'].notnull().sum(axis=1)

    # Is the title lowercase? (a missing title compares unequal -> 0)
    df['TitleLowercase'] = (df.Title.str.lower() == df.Title).astype(int)

    # Flags for whether Title contains certain words; na=False keeps the
    # result boolean so the int cast cannot fail on missing titles
    keyword_columns = (
        ('TitleQuestion', 'question'),
        ('TitleNeed', 'need'),
        ('TitleHelp', 'help'),
    )
    for column, word in keyword_columns:
        df[column] = df.Title.str.contains(word, case=False, na=False).astype(int)

    return df
# Build the engineered feature set from the training file
train = make_features('train.csv')
# Target: the OpenStatus column
y = train['OpenStatus']
# Predictors: every other column of the training frame
X = train.drop(['OpenStatus'], axis=1)
# Run the test file through the same feature engineering
test = make_features('test.csv')
'''
Five feature logistic regression model
'''
# Columns used by the logistic regression model
feature_cols_logreg = ['ReputationAtPostCreation', 'Answers', 'TitleLength', 'BodyLength', 'NumTags']
# Estimate performance with 5-fold cross-validation. The score is stored and
# printed so it is not silently discarded when this file runs as a script.
# NOTE(review): the leading minus assumes the "log_loss" scorer returns the
# negated loss (the name is from older scikit-learn releases; newer ones call
# it "neg_log_loss") -- confirm against the installed version.
logreg = LogisticRegression()
logreg_cv_log_loss = -cross_val_score(logreg, X[feature_cols_logreg], y, scoring="log_loss", cv=5).mean()
print('Logistic regression CV log loss: %s' % logreg_cv_log_loss)
# Refit on all of the training data, then keep column 1 of predict_proba
# (the second class in logreg.classes_) for the actual testing data
logreg.fit(X[feature_cols_logreg], y)
y_prob_logreg = logreg.predict_proba(test[feature_cols_logreg])[:, 1]
'''
Four feature random forest model
'''
# Columns used by the random forest model
feature_cols_rf = ['TitleLowercase', 'TitleQuestion', 'TitleNeed', 'TitleHelp']
# Fix the seed so the forest -- and with it the CV score and the predicted
# probabilities -- is reproducible from one run to the next
rf = RandomForestClassifier(random_state=1)
# Estimate performance with 5-fold cross-validation. The score is stored and
# printed so it is not silently discarded when this file runs as a script.
# NOTE(review): the leading minus assumes the "log_loss" scorer returns the
# negated loss (newer scikit-learn calls it "neg_log_loss") -- confirm.
rf_cv_log_loss = -cross_val_score(rf, X[feature_cols_rf], y, scoring="log_loss", cv=5).mean()
print('Random forest CV log loss: %s' % rf_cv_log_loss)
# Refit on all of the training data, then keep column 1 of predict_proba
# (the second class in rf.classes_) for the actual testing data
rf.fit(X[feature_cols_rf], y)
y_prob_rf = rf.predict_proba(test[feature_cols_rf])[:, 1]
'''
Text logistic regression model on 'Title' using pipeline
'''
# Make pipeline: bag-of-words counts of the title (English stop words
# removed) feeding a logistic regression
pipe = make_pipeline(CountVectorizer(stop_words='english'), LogisticRegression())
# Estimate performance with 5-fold cross-validation. The score is stored and
# printed so it is not silently discarded when this file runs as a script.
# NOTE(review): the leading minus assumes the "log_loss" scorer returns the
# negated loss (newer scikit-learn calls it "neg_log_loss") -- confirm.
pipe_cv_log_loss = -cross_val_score(pipe, X['Title'], y, scoring="log_loss", cv=5).mean()
print('Title text pipeline CV log loss: %s' % pipe_cv_log_loss)
# Refit on all of the training titles, then keep column 1 of predict_proba
# (the second class of the final estimator) for the actual testing data
pipe.fit(X['Title'], y)
y_prob_pipe = pipe.predict_proba(test['Title'])[:, 1]
'''
Create submission
'''
# Ensemble predictions as a weighted average in which the text pipeline counts
# twice as much as each of the other two models. The weighted sum is divided
# by the sum of the weights (4, not 3) so that every blended value stays a
# valid probability in [0, 1].
w_logreg, w_rf, w_pipe = 1.0, 1.0, 2.0
y_prob_combined = (
    w_logreg * y_prob_logreg + w_rf * y_prob_rf + w_pipe * y_prob_pipe
) / (w_logreg + w_rf + w_pipe)
# Create a DataFrame that has 'id' as the index, then export to a CSV file
sub = pd.DataFrame({'id': test.index, 'OpenStatus': y_prob_combined}).set_index('id')
sub.to_csv('sub_ensemble.csv')