# functions.py
class Timer:
    '''A timer used to record how long a process takes.
    After instantiating, call .start() and .stop()
    before and after the process, in that order.'''

    def __init__(self, format_="%m/%d/%y - %I:%M %p"):
        import tzlocal
        self.tz = tzlocal.get_localzone()
        self.fmt = format_
        self.created_at = self.get_time()

    def get_time(self):
        '''Return the current time in the local timezone.'''
        import datetime as dt
        return dt.datetime.now(self.tz)

    def start(self):
        # Stored as start_time so the attribute does not shadow this method
        self.start_time = self.get_time()
        print(f"[i] Timer started at {self.start_time.strftime(self.fmt)}")

    def stop(self):
        self.end_time = self.get_time()
        print(f"[i] Timer ended at {self.end_time.strftime(self.fmt)}")
        print(f"- Total time = {self.end_time - self.start_time}")
if __name__ == "__main__":
    # Quick demo of the Timer class, guarded so it does not run on import
    timer = Timer()
    print(timer.created_at)
    timer.start()
    timer.stop()
def process_comment(text):
    '''Pre-processing function that strips stopwords, punctuation and
    capitalization from text, tokenizes it, and returns the 100 most
    frequently used words.
    text - the text to be cleaned, as a string'''
    import string
    from nltk import word_tokenize, FreqDist
    from nltk.corpus import stopwords
    # Get all the stop words in the English language
    stopwords_list = stopwords.words('english')
    # Remove punctuation
    stopwords_list += list(string.punctuation)
    # Ad hoc additions of strings that don't appear to contribute;
    # 'article', 'page' and 'wikipedia' were added iteratively because
    # they occur in most comment strings
    stopwords_list += ("''", "``", "'s", "\\n\\n", '...', 'i\\', '\\n',
                       '•', "i", 'the', "'m", "'ve", "don\\'t",
                       "'re", "\\n\\ni", "it\\", "'ll", 'you\\', "'d", "n't",
                       '’', 'article', 'page', 'wikipedia')
    tokens = word_tokenize(text)
    stopped_tokens = [w.lower() for w in tokens if w.lower() not in stopwords_list]
    freqdist = FreqDist(stopped_tokens)
    return freqdist.most_common(100)
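
# Example usage of process_comment, kept as a comment so it doesn't run on
# import. The comment string is hypothetical; this assumes the NLTK 'punkt'
# and 'stopwords' corpora have been downloaded via nltk.download().
# process_comment("You are SO wrong about this!")  # -> [('wrong', 1)]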
def clean_up(freq_tox):
    '''Takes the most frequently used, highly offensive words and replaces
    them with censored versions.
    freq_tox - (word, count) pairs, e.g. the output of FreqDist.most_common()
    '''
    # Dictionary mapping the most offensive words to censored versions
    replace = {'fuck': 'f$%!', 'nigger': "n*###%", 'nigga': 'n#5#*', 'fucking': 'f*@%!ng',
               'faggot': 'f@&&*#', 'cunt': 'c&#^', 'fag': 'f@$',
               "'fuck": "'f$%!'", 'faggots': 'f@&&*!$'}
    # That dictionary doesn't render in the word cloud, so there are two versions.
    # Alternate dictionary of primary offensive words to clean up a visual. These are
    # highly offensive and I am completely uncomfortable even typing them for this
    # purpose, but it needs to be done.
    # dic = {'fuck': 'fword', 'nigger': "nword", 'nigga': 'nwordderivative', 'fucking': 'fwordderivative',
    #        'faggot': 'fwordforgay', 'cunt': 'cword', 'cunts': 'cwords', 'shit': 'shword', 'fag': 'shortenedfwordforgay',
    #        "'fuck": "'fword'", 'faggots': 'fwordforgays'}
    # Using the 'replace' dictionary above, swap each offensive key for its
    # censored version while keeping the counts
    new_dict = {replace.get(k, k): v for k, v in dict(freq_tox).items()}
    cleaned_list = [[k, v] for k, v in new_dict.items()]
    return cleaned_list
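
# Example usage of clean_up with hypothetical counts:
# clean_up([('fuck', 12), ('hello', 5)])  # -> [['f$%!', 12], ['hello', 5]]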
def replace_all(lst, dic):
    '''Another version of cleaning a list of words using a dictionary.
    This function is more flexible than clean_up because the list and
    dictionary can be supplied on the fly.
    lst - a list of strings
    dic - a dictionary mapping substrings to their replacements
    '''
    new_lst = []
    for st in lst:
        # Apply every replacement in turn to this string
        for i, j in dic.items():
            st = st.replace(i, j)
        new_lst.append(st)
    return new_lst
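
# Example usage of replace_all with a hypothetical on-the-fly dictionary:
# replace_all(['bad word', 'good word'], {'bad': 'b#d'})
# -> ['b#d word', 'good word']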
def wrd_cld(toks):
    '''Visualizes word frequency using tokens; specific to findings from the
    corpus used in Kaggle's toxic comment challenge.
    toks - tokens rendered from tokenization'''
    import string
    from nltk.corpus import stopwords
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud
    # Get all the stop words in the English language
    stopwords_list = stopwords.words('english')
    # Remove punctuation
    stopwords_list += list(string.punctuation)
    # Ad hoc additions of strings that don't appear to contribute;
    # 'article', 'page' and 'wikipedia' were added iteratively because
    # they occur in most comment strings
    stopwords_list += ("''", "``", "'s", "\\n\\n", '...', 'i\\', '\\n',
                       '•', "i", 'the', "'m", "'ve", "don\\'t",
                       "'re", "\\n\\ni", "it\\", "'ll", 'you\\', "'d", "n't",
                       '’', 'article', 'page', 'wikipedia')
    # Named 'cloud' so the local variable doesn't shadow the wordcloud module
    cloud = WordCloud(stopwords=stopwords_list, collocations=False)
    cloud.generate(','.join(toks))
    plt.figure(figsize=(12, 12), facecolor=None)
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
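
# Example usage of wrd_cld with hypothetical tokens; assumes the wordcloud
# package is installed and a matplotlib display backend is available:
# wrd_cld(['toxic', 'comment', 'toxic', 'data'])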
def plot_acc_loss(history):
    '''Plotting function to visualize the performance history of a deep
    learning model, comparing training data with validation data on the
    accuracy and loss metrics.
    history - the History object returned by .fit() in predictive modeling.'''
    import matplotlib.pyplot as plt
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, acc, label='Training accuracy')
    plt.plot(epochs, val_acc, color='g', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.figure()
    plt.plot(epochs, loss, label='Training loss')
    plt.plot(epochs, val_loss, color='g', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()
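
# Example usage of plot_acc_loss; model, X_train and y_train are hypothetical
# placeholders for a Keras model compiled with metrics=['accuracy'] and
# trained with validation data:
# history = model.fit(X_train, y_train, validation_split=0.2, epochs=10)
# plot_acc_loss(history)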
def tok_text(text):
    '''Tokenizes text from a string, dropping stopwords and punctuation.
    text - text in string format
    '''
    import string
    from nltk.corpus import stopwords
    from nltk import word_tokenize
    # Get all the stop words in the English language
    stopwords_list = stopwords.words('english')
    # Remove punctuation
    stopwords_list += list(string.punctuation)
    # Ad hoc additions of strings that don't appear to contribute;
    # 'article', 'page' and 'wikipedia' were added iteratively because
    # they occur in most comment strings
    stopwords_list += ("''", "``", "'s", "\\n\\n", '...', 'i\\', '\\n',
                       '•', "i", 'the', "'m", "'ve", "don\\'t",
                       "'re", "\\n\\ni", "it\\", "'ll", 'you\\', "'d", "n't",
                       '’', 'article', 'page', 'wikipedia')
    tokens = word_tokenize(text)
    stopped_tokens = [w.lower() for w in tokens if w.lower() not in stopwords_list]
    return stopped_tokens
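
# Example usage of tok_text with a hypothetical comment; assumes the NLTK
# 'punkt' and 'stopwords' corpora have been downloaded:
# tok_text("This Wikipedia page is a mess!")  # -> ['mess']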
def freq_dist(tokens, n=100):
    '''Function to discover the most frequently used words in a corpus.
    tokens - takes tokenized words as input
    n - how many of the most common words to return (default 100)
    Returns a list of tuples displaying the words and associated counts.'''
    from nltk import FreqDist
    freqdist = FreqDist(tokens)
    return freqdist.most_common(n)
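
# Example usage of freq_dist with hypothetical tokens:
# freq_dist(['a', 'b', 'a'], n=2)  # -> [('a', 2), ('b', 1)]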
def class_report_model(y_train, y_test, y_preds):
    '''Prints a confusion matrix and classification report for each label,
    using the training, testing and prediction outputs of an RNN
    classification model.
    y_train - training labels as a DataFrame (used for the column names)
    y_test - testing labels as a DataFrame
    y_preds - multilabel classification predictions in DataFrame form'''
    from sklearn.metrics import classification_report, confusion_matrix
    # One confusion matrix and report per label column
    for i in range(y_train.shape[1]):
        y_i_hat_trnn = y_preds.iloc[:, i]
        y_tst = y_test.iloc[:, i]
        print(y_train.columns[i])
        print(confusion_matrix(y_tst, y_i_hat_trnn, normalize='true'))
        print()
        print(classification_report(y_tst, y_i_hat_trnn))
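
# Example usage of class_report_model with hypothetical 0/1 label DataFrames,
# one column per class (requires pandas and scikit-learn >= 0.22 for the
# normalize argument to confusion_matrix):
# import pandas as pd
# y_true = pd.DataFrame({'toxic': [0, 1, 1], 'obscene': [1, 0, 1]})
# preds = pd.DataFrame({'toxic': [0, 1, 0], 'obscene': [1, 0, 1]})
# class_report_model(y_true, y_true, preds)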