-
Notifications
You must be signed in to change notification settings - Fork 2
/
preprocessing.py
385 lines (319 loc) · 14.1 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import *
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import webtext
from nltk.probability import FreqDist
tqdm.pandas()
'''Lowercasing'''
def prep_lowercase(df, col = 'Tweet'):
    """
    Lowercase the tweets of a dataframe column.

    Inputs:
        - df (DataFrame) : dataframe containing the tweets
        - col (string) : name of the column to lowercase, by default 'Tweet'
    Outputs:
        - df_new (pandas.Series) : Series containing the sentences lowercased
    """
    # Bug fix: the original hard-coded df['Tweet'] and silently ignored `col`.
    df_new = df[col].progress_apply(lambda x : x.lower())
    return df_new
''' Removing contractions'''
#The contraction were obtained on this website that got them from Wikipedia
#https://gist.githubusercontent.com/Sirsirious/c70400176a4532899a483e06d72cf99e/raw/e46fa7620c4f378f5bf39608b45cddad7ff447a4/english_contractions.json
def load_contractions(PATH_PREPROCESSING):
    """
    Loads the contraction list from 'english_contractions.json'.

    Inputs:
        - PATH_PREPROCESSING (string): path of the folder containing the contraction list
    Outputs:
        - contraction_list (dict) : dictionary mapping each contraction (key) to its
          corresponding English word (value)
    """
    # Fix: the original opened the file without closing it; `with` guarantees release.
    with open(PATH_PREPROCESSING + 'english_contractions.json', 'r') as fp:
        contraction_list = json.load(fp)
    # adding two contractions found by scrolling through the tweets
    contraction_list['ur'] = "your"
    contraction_list['ya'] = "you"
    return contraction_list
def remove_contractions(sentence, contraction_list):
    """
    Replace every known contraction in a sentence by its English expansion.

    Inputs:
        - sentence (string): initial sentence possibly containing contractions
        - contraction_list (dict) : maps each contraction to its English word
    Outputs:
        - (string) : the sentence with every known contraction replaced
    """
    # dict.get falls back to the word itself when it is not a contraction
    expanded = (contraction_list.get(token, token) for token in sentence.split(" "))
    return " ".join(expanded)
def remove_contractions_df(df, contraction_list, col = 'Tweet'):
    """
    Expand the contractions in every tweet of a dataframe column.

    Inputs:
        - df (DataFrame): dataframe containing the tweets
        - contraction_list (dict) : maps each contraction to its English word
        - col (string) : name of the column holding the tweets, by default 'Tweet'
    Outputs:
        - (pandas.Series) : the tweets with every known contraction expanded
    """
    def _expand(tweet):
        # delegate the per-sentence work to the dedicated helper
        return remove_contractions(tweet, contraction_list)
    return df[col].progress_apply(_expand)
'''Removing slang'''
#The slang dictionnary was obtained on this website
#https://aclanthology.org/P11-1038/
def slang_dict_to_tuple(PATH_PREPROCESSING):
    """
    Reads the raw lines of the 'slang.dict' file.

    Note: despite the historical name, the return value is the list of raw
    text lines (newlines included); slang/expansion pairs occupy consecutive
    lines and are paired up later by slang_tuple_to_dict.

    Inputs:
        - PATH_PREPROCESSING (string): path to the folder containing the .dict file
    Outputs:
        - lines (list) : list of the file's lines, alternating slang and its
          corresponding English expansion
    """
    # Fix: use a context manager instead of manual open/close, and let
    # readlines() do the line collection.
    with open(PATH_PREPROCESSING + 'slang.dict') as f:
        lines = f.readlines()
    return lines
def slang_tuple_to_dict(lines):
    #https://stackoverflow.com/questions/7100125/storing-python-dictionaries
    """
    Builds a slang dictionary from an alternating list of lines.

    Consecutive pairs of lines are interpreted as (slang, expansion): even
    indices hold the slang term, odd indices its English expansion.

    Inputs:
        - lines (list): list of raw lines alternating slang and English word
    Outputs:
        - (dict) : dictionary mapping each slang term to its English word
    """
    # key keeps any surrounding spaces (only the newline is dropped),
    # while the expansion is additionally stripped — as in the source data
    return {
        lines[idx].replace('\n', ''): lines[idx + 1].replace('\n', '').strip()
        for idx in range(0, len(lines), 2)
    }
def slang_tuple_to_json(slang_dict,PATH_PREPROCESSING):
    """
    Saves the slang dictionary as 'slang.json' in the given folder.

    Inputs:
        - slang_dict (dict) : dictionary mapping each slang term to its English word
        - PATH_PREPROCESSING (string) : path to the folder where to store the json
    Outputs:
        - None : prints 'file saved' once the file has been written
    """
    with open(PATH_PREPROCESSING + 'slang.json', 'w') as fp:
        json.dump(slang_dict, fp)
    # Idiom fix: `return print(...)` returned print's None; print alone is
    # equivalent and clearer (the function implicitly returns None).
    print('file saved')
def remove_slang(sentence, slang_list):
    """
    Replace every known slang term in a sentence by its English word.

    Inputs:
        - sentence (string) : initial sentence possibly containing slang
        - slang_list (dict) : dictionary mapping slang to its English word
    Outputs:
        - (string) : the sentence with every known slang term replaced
    """
    tokens = sentence.split(" ")
    # unknown tokens pass through unchanged
    translated = [slang_list.get(tok, tok) for tok in tokens]
    return " ".join(translated)
def remove_slang_df(df, slang_list, col = 'Tweet'):
    """
    Replace the slang in every tweet of a dataframe column.

    Inputs:
        - df (DataFrame) : dataframe containing the tweets with slang
        - slang_list (dict) : dictionary mapping slang to its English word
        - col (string) : name of the column holding the tweets, by default 'Tweet'
    Outputs:
        - (pandas.Series) : the tweets with every known slang term replaced
    """
    def _translate(tweet):
        # per-sentence replacement handled by the dedicated helper
        return remove_slang(tweet, slang_list=slang_list)
    return df[col].progress_apply(_translate)
'''Remove stopwords'''
#https://stackabuse.com/removing-stop-words-from-strings-in-python/
def load_stopwords():
    """
    Loads the English stopwords from nltk, plus two dataset-specific tokens.

    Inputs:
        (none)
    Outputs:
        - all_stopwords (list) : English stopwords downloaded from nltk,
          extended with '<user>' and '<url>'
    """
    nltk.download('stopwords')
    all_stopwords = stopwords.words('english')
    # two dataset-specific stop words spotted by scrolling through the tweets
    all_stopwords.extend(['<user>', '<url>'])
    return all_stopwords
def remove_stopwords(sentence, all_stopwords):
    """
    Drop every stopword from a sentence.

    Inputs:
        - sentence (string) : sentence possibly containing stopwords
        - all_stopwords (list) : the stop words to remove
    Outputs:
        - (string) : the sentence without the stopwords
    """
    #https://stackabuse.com/removing-stop-words-from-strings-in-python/
    # set membership is O(1); testing against the raw list would be O(n) per token
    stopword_set = set(all_stopwords)
    kept = [tok for tok in sentence.split(" ") if tok not in stopword_set]
    return " ".join(kept)
def remove_stopwords_df(df, all_stopwords, col = 'Tweet'):
    """
    Drop the stopwords from every tweet of a dataframe column.

    Inputs:
        - df (DataFrame) : DataFrame containing the tweets with stopwords
        - all_stopwords (list) : the stop words to remove
        - col (string) : name of the column holding the tweets, by default 'Tweet'
    Outputs:
        - (pandas.Series) : the tweets without the stop-words
    """
    def _filter(tweet):
        # per-sentence filtering handled by the dedicated helper
        return remove_stopwords(tweet, all_stopwords=all_stopwords)
    return df[col].progress_apply(_filter)
'''Stemming'''
def stem_sent(sentence, stemmer):
    """
    Stem every word of a sentence.

    Inputs:
        - sentence (string) : the tweet, not yet stemmed
        - stemmer (nltk.Stemmer) : stemmer from the nltk library
    Outputs:
        - (string) : the sentence with each word stemmed
    """
    #https://www.nltk.org/howto/stem.html
    stemmed_tokens = [stemmer.stem(token) for token in sentence.split(" ")]
    return " ".join(stemmed_tokens)
def stem_sent_df(df, stemmer, col = 'Tweet'):
    """
    Stem every tweet of a dataframe column.

    Inputs:
        - df (DataFrame) : dataframe containing the tweets to stem
        - stemmer (nltk.Stemmer) : stemmer from the nltk library
        - col (string) : name of the column holding the tweets, by default 'Tweet'
    Outputs:
        - (pandas.Series) : the tweets stemmed
    """
    def _stem(tweet):
        # per-sentence stemming handled by the dedicated helper
        return stem_sent(tweet, stemmer=stemmer)
    return df[col].progress_apply(_stem)
'''Removing punctuation'''
def remove_punct_df(df, tokenizer, col = 'Tweet'):
    """
    Strip the punctuation from every tweet of a dataframe column.

    Inputs:
        - df (DataFrame) : dataframe containing the tweets
        - tokenizer (nltk.RegexpTokenizer) : nltk tokenizer that drops punctuation
        - col (string) : name of the column holding the tweets, by default 'Tweet'
    Outputs:
        - (pandas.Series) : the tweets without the punctuation
    """
    #https://www.kite.com/python/answers/how-to-remove-all-punctuation-marks-with-nltk-in-python
    def _strip_punct(tweet):
        # tokenizing drops the punctuation; re-joining yields a clean sentence
        tokens = tokenizer.tokenize(tweet)
        return " ".join(tokens)
    return df[col].progress_apply(_strip_punct)
'''Removing less frequent words'''
def word_occ_tofile(PATH_PREPROCESSING, df,file_name = 'count_word_pos.txt', col ='Tweet'):
    """
    Writes every tweet on its own line into a file, so that the words can be
    counted later by low_occuring_words.

    Inputs:
        - PATH_PREPROCESSING (string) : path to the folder where to write the file
        - df (DataFrame) : dataframe containing the tweets
        - file_name (string) : name of the output file, by default 'count_word_pos.txt'
        - col (string) : name of the column holding the tweets, by default 'Tweet'
    Outputs:
        - None : the file PATH_PREPROCESSING + file_name is (over)written
    """
    # Fix: "w+" opened the file for read+write, but we only ever write —
    # plain "w" is sufficient; writelines batches the small writes.
    with open(PATH_PREPROCESSING + file_name, "w") as fo:
        fo.writelines(tweet + "\n" for tweet in df[col].tolist())
def low_occuring_words(PATH_PREPROCESSING, file_name = 'count_word_pos.txt' ):
    """
    Collects the words appearing at least 5 times in the corpus file.

    Note: despite the name, the returned list contains the *frequent* words
    (count >= 5); callers keep only these and drop everything else.

    Inputs :
        - PATH_PREPROCESSING (string) : path to the folder containing the word file
        - file_name (string) : name of the file with one tweet per line,
          by default 'count_word_pos.txt'
    Outputs :
        - filter_words (list) : words that appear 5 or more times in the corpus,
          in order of first appearance
    """
    #https://www.geeksforgeeks.org/python-count-occurrences-of-each-word-in-given-text-file-using-dictionary/
    counts = dict()
    # Fix: the original opened the file without ever closing it; `with`
    # guarantees the handle is released.
    with open(PATH_PREPROCESSING + file_name, "r") as txt:
        for line in txt:
            # strip the trailing newline / surrounding spaces, then split on spaces
            for word in line.strip().split(" "):
                # dict.get collapses the if/else counting into one line
                counts[word] = counts.get(word, 0) + 1
    # keep a word only if it appears 5 times or more
    filter_words = [word for word, count in counts.items() if count >= 5]
    return filter_words
#Function to remove words in sentence
def remove_low_words(sentence, lst):
    """
    Keep only the words of a sentence that belong to the allowed list.

    Inputs :
        - sentence (string) : sentence in which to drop the rare words
        - lst (list or set) : the high-frequency words to keep
    Outputs :
        - (string) : the sentence with only the allowed words remaining
    """
    kept_tokens = [token for token in sentence.split(" ") if token in lst]
    return " ".join(kept_tokens)
def remove_low_words_df(df, lst, col = 'Tweet' ):
    """
    Drop the low-frequency words from every tweet of a dataframe column.

    Inputs :
        - df (DataFrame) : dataframe containing the tweets
        - lst (list) : the high-frequency words to keep
        - col (string) : name of the column holding the tweets, by default 'Tweet'
    Outputs :
        - (pandas.Series) : the tweets without the low-appearing words
    """
    allowed = set(lst)  # O(1) membership tests instead of O(n) list scans
    def _filter(tweet):
        return remove_low_words(tweet, allowed)
    return df[col].progress_apply(_filter)
def preprocessing(df, contraction_list, stemmer, filter_words, slang_list, all_stopwords, tokenizer):
    '''
    All preprocessing steps combined, applied in order on the 'Tweet' column.

    Inputs :
        - df (DataFrame) : dataframe containing the tweets
        - contraction_list (dict) : English contractions and their corresponding word
        - stemmer (nltk.Stemmer) : nltk stemmer used to stem the tweets
        - filter_words (list) : high-frequency words to keep
        - slang_list (dict) : slang terms and their corresponding English word
        - all_stopwords (list) : the English stopwords
        - tokenizer (nltk.RegexpTokenizer) : nltk tokenizer that removes punctuation
    Outputs :
        - df (DataFrame) : DataFrame with the 'Tweet' column fully pre-processed
    '''
    # The pipeline as an ordered table of steps; each step takes the dataframe
    # and returns the transformed 'Tweet' Series.
    pipeline = (
        lambda d: prep_lowercase(d),
        lambda d: remove_contractions_df(d, contraction_list),
        lambda d: remove_slang_df(d, slang_list),
        # second lowercase pass, kept from the original ordering
        lambda d: prep_lowercase(d),
        lambda d: remove_stopwords_df(d, all_stopwords),
        lambda d: stem_sent_df(d, stemmer),
        lambda d: remove_punct_df(d, tokenizer),
        lambda d: remove_low_words_df(d, lst = filter_words, col = 'Tweet'),
    )
    for step in pipeline:
        df['Tweet'] = step(df)
    return df
"""
N-Grams
"""
def generate_N_grams(words, ngram=1):
    """
    Build the n-grams of a token list.

    Inputs :
        - words (list) : list of words in a tweet
        - ngram (int) : size of the n-grams to generate, by default 1
    Outputs :
        - (list) : list of space-joined n-gram strings
    """
    #https://www.analyticsvidhya.com/blog/2021/09/what-are-n-grams-and-how-to-implement-them-in-python/
    # zip over the n shifted copies yields each window of n consecutive tokens
    shifted = [words[offset:] for offset in range(ngram)]
    grams = [" ".join(window) for window in zip(*shifted)]
    return grams