preprocess.py
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
import re

# Punctuation and whitespace escapes stripped out before counting words.
regular_expression = re.compile('[' + re.escape('!@#$^&*()+=-_,./:;<>?"[\\]^_`{|}~') + '\\r\\t\\n]')

def turn_count(data):
    """Return the number of speaker turns in a transcript (list of turn strings)."""
    assert type(data) == type([]), "Data is not a list."
    return len(data)

def word_count(data, special_characters_rm = regular_expression):
    """Count the words in a transcript (list of turns) or in a single string, ignoring punctuation."""
    assert type(data) == type([]) or type(data) == type('Document'), "Data is not a list or string"
    assert type(special_characters_rm) == re.Pattern, "Your special characters are not a regex Pattern object"
    wordcount = 0
    if type(data) == type([]):
        for speaker_turn in data:
            sentences = sent_tokenize(speaker_turn)
            for sentence in sentences:
                de_punct_sentence = re.sub(special_characters_rm, ' ', sentence)
                wordcount += len(de_punct_sentence.split())
        return wordcount
    else:
        sentences = sent_tokenize(data)
        for sentence in sentences:
            de_punct_sentence = re.sub(special_characters_rm, ' ', sentence)
            wordcount += len(de_punct_sentence.split())
        return wordcount
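
# Illustrative usage of word_count (made-up turns, not data from this project):
#   turns = ["Hello there, how are you?", "I'm fine, thanks."]
#   word_count(turns)       # -> 8 words once the punctuation matched by `regular_expression` is stripped
#   word_count(turns[0])    # a single string is also accepted (-> 5)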

def speaker_count(data):
    """
    Count the unique speakers in a list of speaker labels. When a label is a single token
    (e.g. 'SMITH'), any longer label containing that token is treated as the same speaker.
    """
    assert type(data) == type([]), "Data is not a list."
    unique_values = pd.Series(data)
    unique_set = set(unique_values)
    for speaker in unique_values:
        name = speaker.split()
        if len(name) == 1:
            for other_speaker in list(unique_set - {name[0]}):
                if name[0] in other_speaker:
                    unique_set = unique_set - {other_speaker}
    return len(unique_set)
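
# Illustrative usage of speaker_count (hypothetical speaker labels):
#   speakers = ["JOHN SMITH, CNN HOST", "SMITH", "JANE DOE"]
#   speaker_count(speakers)   # -> 2, since the single-token label "SMITH" absorbs "JOHN SMITH, CNN HOST"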

def normalize_data(data, keep_nonascii = False, keep_punctuation = False, remove_extra_spaces = True, lowercase = True, word_length = None):
    """
    Performs specific cleaning operations on the data, as long as the data is a string.
    :param data: string representing review
    :param keep_nonascii: boolean value representing whether to keep non-ascii characters or not
    :param keep_punctuation: boolean value representing whether to keep punctuation or not
    :param remove_extra_spaces: boolean value representing whether to remove extra spaces between, before, and after words
    :param lowercase: boolean value representing whether to lowercase data or not
    :param word_length: if None, word length is not taken into account. If it is a number, only words of at least that length are kept.
    :return: normalized string
    """
    assert type(data) == type('Document'), "Your data is not a string."
    temp_data = data
    if not keep_nonascii:
        # Dropping non-ASCII characters yields bytes, decoded back to str before further steps.
        temp_data = temp_data.encode('ascii', errors = 'ignore')
    if not keep_punctuation:
        # Digits are removed here as well, together with the punctuation characters below.
        punct_expression = re.compile('[' + re.escape('!@#$^&*\'()+=-_,./:;<>?"[\\]^_`{|}~') + '0-9\\r\\t\\n]')
        if type(temp_data) == type(b'bytes'):
            temp_data = temp_data.decode('utf-8')
        temp_data = punct_expression.sub(' ', temp_data)
    try:
        if type(temp_data) != type('string'):
            temp_data = temp_data.decode('utf-8')
    except:
        pass
    if remove_extra_spaces:
        split_data = temp_data.split(' ')
        just_words = [word.strip() for word in split_data if word not in ['', '\r', '\t', ' ']]
        temp_data = ' '.join(just_words)
    if lowercase:
        temp_data = temp_data.lower()
    if word_length:
        assert type(word_length) == type(5)
        split_data = temp_data.split(' ')
        select_words = [word.strip() for word in split_data if len(word.strip()) >= word_length]
        temp_data = ' '.join(select_words)
    return temp_data
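
# Illustrative usage of normalize_data with the default flags (a made-up review string):
#   normalize_data("This Café was GREAT!!! 10/10, would visit again...")
#   # -> 'this caf was great would visit again'
#   # (the non-ASCII 'é' is dropped, punctuation and digits are replaced by spaces,
#   #  extra spaces are collapsed, and the text is lowercased)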

def tokenize(data):
    """
    Function to tokenize data. Best used AFTER stopword removal, contraction replacement, and normalization,
    and as the final step after lemmatization and POS tagging.
    :param data: string representing review
    :return: tokenized string as list of tokens
    """
    assert type(data) == type('Document'), "Your data is not a string."
    return word_tokenize(data)

def replace_contractions(data, contractions):
    """
    Function to replace contractions in data. Best used BEFORE normalization.
    Note that the text is lowercased as part of the replacement.
    :param data: string representing review
    :param contractions: dictionary of predefined contractions and their expansions
    :return: string with contractions replaced
    """
    assert type(data) == type('Document'), "Your data is not a string."
    assert type(contractions) == type({}), "Your contractions are not in a dictionary."
    contraction_list = contractions.keys()
    temp_data = str(data.lower()).split()
    for i in range(len(temp_data)):
        if temp_data[i] in contraction_list:
            temp_data[i] = contractions[temp_data[i]]
    data = ' '.join(temp_data)
    return data
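
# Illustrative usage of replace_contractions (a hypothetical contractions mapping):
#   contractions = {"don't": "do not", "i'm": "i am"}
#   replace_contractions("I'm sure they don't mind", contractions)
#   # -> 'i am sure they do not mind' (note that the text is lowercased in the process)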

def lemmatize(data, lemmatizer):
    """
    Function to lemmatize data. Best used BEFORE tokenization and AFTER normalization.
    :param data: string representing review
    :param lemmatizer: a spaCy language pipeline (the code relies on spaCy's token.lemma_ attribute)
    :return: string with lemmas only
    """
    assert type(data) == type('Document'), "Your data is not a string."
    lemmas = []
    sentences = sent_tokenize(data)
    for s in sentences:
        processed_sentence = lemmatizer(s)
        lemmas.extend([word.lemma_ for word in processed_sentence])
    return ' '.join(lemmas)
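
# Illustrative usage of lemmatize; this assumes spaCy and its en_core_web_sm model are installed,
# which is not stated in this file:
#   import spacy
#   nlp = spacy.load('en_core_web_sm')
#   lemmatize("the cats were running", nlp)   # -> roughly 'the cat be run'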

def clean_pos_tags(data, relevant_tags, general_category = True):
    """
    Function to remove unnecessary words by filtering out based on part of speech.
    Best used AFTER normalization and contraction replacement, but BEFORE other steps.
    :param data: string representing review
    :param relevant_tags: list of NLTK POS tags to keep
    :param general_category: if True, you only need to specify general categories like JJ, NN, and it will take subcategories too.
    :return: string with required words only
    """
    assert type(data) == type('Document'), "Your data is not a string."
    assert type(relevant_tags) == type([]), "Your POS tags are not in a list."
    tagged_data = nltk.pos_tag(data.split())
    # print(tagged_data)
    if general_category:
        kept_words = [tag_pair[0] for tag_pair in tagged_data if tag_pair[1].startswith(tuple(relevant_tags))]
    else:
        kept_words = [tag_pair[0] for tag_pair in tagged_data if tag_pair[1] in relevant_tags]
    return ' '.join(kept_words)
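
# Illustrative usage of clean_pos_tags (made-up sentence; tags are NLTK's Penn Treebank tags):
#   clean_pos_tags("the quick brown fox jumps over the lazy dog", ['NN', 'JJ'])
#   # keeps only words whose tag starts with NN or JJ (e.g. 'quick', 'fox', 'dog'),
#   # because general_category is True by default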

def remove_stopwords(data, stopwords):
    """
    Function to remove stopwords from data. Best used AFTER normalization.
    Note that words of three characters or fewer are dropped as well.
    :param data: string representing review
    :param stopwords: list of stopwords to be removed
    :return: string with stopwords removed
    """
    assert type(data) == type('Document'), "Your data is not a string."
    assert type(stopwords) == type([]), "Your stopwords are not in a list."
    split_data = data.split(' ')
    cleaned_data = [word for word in split_data if word not in stopwords and len(word) > 3]
    new_data = ' '.join(cleaned_data)
    return new_data
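
# Illustrative usage of remove_stopwords (hypothetical stopword list):
#   remove_stopwords("this is a very interesting interview", ['this', 'is', 'a', 'very'])
#   # -> 'interesting interview' (words of three characters or fewer are dropped as well)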

def replace_multi_words(data, multi_dict):
    """
    Function to replace common multi-words in data.
    :param data: string representing review
    :param multi_dict: dictionary representing multi-words to combine.
    :return: string with replaced multi-words
    """
    assert type(data) == type('Document'), "Your data is not a string."
    assert type(multi_dict) == type({}), "Your multi-word dictionary is not a dictionary."
    new_data = str(data)
    for replacement in multi_dict.keys():
        if re.search(replacement, data):
            new_data = re.sub(replacement, multi_dict[replacement], new_data)
    return new_data

def replace_ngrams(data, ngrams_list):
    """
    Function to replace ngrams in data.
    :param data: string representing review
    :param ngrams_list: list representing ngrams
    :return: string with replaced ngrams
    """
    assert type(data) == type('Document'), "Your data is not a string."
    assert type(ngrams_list) == type([]), "Your list of n-grams is not a list."
    new_data = str(data)
    for replacement in ngrams_list:
        new_data = re.sub(replacement, '_'.join(replacement.split()), new_data)
    return new_data
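
# Illustrative usage of replace_ngrams (a hypothetical bigram):
#   replace_ngrams("the climate change debate continued", ["climate change"])
#   # -> 'the climate_change debate continued'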

def create_pipeline(data,
                    order,
                    keep_nonascii = False,
                    keep_punctuation = False,
                    remove_extra_spaces = True,
                    lowercase = True,
                    word_length = None,
                    contractions = None,
                    lemmatizer = None,
                    stopwords = None,
                    relevant_tags = None,
                    general_category = True,
                    multi_dict = None,
                    ngrams_list = None
                    ):
    """
    Function to set up a preprocessing pipeline on data. Tokenization is done at the end by default.
    :param data: string representing review
    :param order: list representing order of operations.
    :param keep_nonascii: boolean value representing whether to keep non-ascii characters or not
    :param keep_punctuation: boolean value representing whether to keep punctuation or not
    :param remove_extra_spaces: boolean value representing whether to remove extra spaces between, before, and after words
    :param lowercase: boolean value representing whether to lowercase data or not
    :param word_length: if None, word length is not taken into account. If it is a number, only words of at least that length are kept.
    :param contractions: dictionary of predefined contractions and their expansions
    :param lemmatizer: spaCy language pipeline used for lemmatization
    :param stopwords: list of stopwords to remove
    :param relevant_tags: list of NLTK POS tags to keep
    :param general_category: if True, you only need to specify general categories like JJ, NN, and it will take subcategories too.
    :param multi_dict: dictionary representing multi-words to combine.
    :param ngrams_list: list representing ngrams
    Allowed operations: 'norm' -> normalization, 'stop' -> stopword removal, 'ctrc' -> contraction replacement,
    'lemm' -> lemmatization, 'tags' -> cleaning POS tags, 'mult' -> replacing multi-words, 'ngrm' -> replacing ngrams.
    :return: list of words after preprocessing
    """
    assert type(data) == type('Document') or type(data) == type([]), "Your data is not a string or list."
    assert type(order) == type([]), "Your operations order is not a list."
    if type(data) == type([]):
        data = ' '.join(data)
    allowed_ops = ['norm', 'stop', 'ctrc', 'lemm', 'tags', 'mult', 'ngrm']
    pro_data = data
    for operation in order:
        assert operation in allowed_ops, "Operation not in allowed list."
        if operation == 'norm':
            pro_data = normalize_data(pro_data, keep_nonascii, keep_punctuation,
                                      remove_extra_spaces,
                                      lowercase,
                                      word_length)
        elif operation == 'stop':
            assert stopwords is not None, "No stopwords specified"
            pro_data = remove_stopwords(pro_data, stopwords)
        elif operation == 'ctrc':
            assert contractions is not None, "No contractions specified"
            pro_data = replace_contractions(pro_data, contractions)
        elif operation == 'lemm':
            assert lemmatizer is not None, "There is no lemmatizer specified"
            pro_data = lemmatize(pro_data, lemmatizer)
        elif operation == 'tags':
            assert relevant_tags is not None, "There are no tags specified"
            pro_data = clean_pos_tags(pro_data, relevant_tags, general_category)
        elif operation == 'mult':
            assert multi_dict is not None, "There is no multi-word dictionary specified"
            pro_data = replace_multi_words(pro_data, multi_dict)
        elif operation == 'ngrm':
            assert ngrams_list is not None, "There are no n-grams specified"
            pro_data = replace_ngrams(pro_data, ngrams_list)
    return tokenize(pro_data)
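
# Illustrative pipeline call (a sketch with hypothetical inputs; contractions first, then
# normalization, then stopword removal, then the default tokenization at the end):
#   from nltk.corpus import stopwords as nltk_stopwords
#   tokens = create_pipeline("I'm really enjoying this GREAT interview!!!",
#                            order = ['ctrc', 'norm', 'stop'],
#                            contractions = {"i'm": "i am"},
#                            stopwords = nltk_stopwords.words('english'))
#   # -> a token list such as ['really', 'enjoying', 'great', 'interview']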

def generate_ngrams(data, column, n = 2, frequency = 20, best_number = 10, min_pmi = 4, stopwords_list = stopwords.words('english')):
    """
    Function to generate the most common co-occurring phrases in the data.
    :param data: dataframe representing reviews list
    :param column: column name to consider
    :param n: integer representing the number of words to consider for an n-gram. Can take values 2 or 3.
    :param frequency: integer value showing the minimum frequency of n-gram to be considered.
    :param best_number: value determining how many n-grams to return
    :param min_pmi: minimum PMI value to keep
    :param stopwords_list: sequence of stopwords
    :return: list of n-grams for the data, plus a dataframe of the kept n-grams with their PMI scores and POS tags
    """
    assert type(data) == type(pd.DataFrame(columns = ['x'])), "Your data is not a dataframe."
    assert type(column) == type('Document'), "Your column name is not a string."
    assert column in data.columns, "Invalid column name"
    assert type(n) == type(int(5)), "Your n value is not an integer."
    assert n in [2, 3], "Incorrect value of n."
    assert type(frequency) == type(int(1)), "Your frequency value is not an integer."
    assert frequency > 0, "Your frequency does not fall into the appropriate range."
    assert type(min_pmi) in [type(1), type(1.0)], "Your minimum PMI is not a number."
    assert min_pmi > 0, "Your minimum PMI does not fall into the appropriate range."
    assert type(best_number) == type(int(5)) and best_number > 0, "Best number is not an integer greater than 0"
    assert type(stopwords_list) in [type([]), type({'x': 1}.keys())], "Stopwords are not in the right format"
    final_list = []
    if n == 2:
        bigram_collocations = nltk.collocations.BigramAssocMeasures()
        total_list = nltk.collocations.BigramCollocationFinder.from_documents([text for text in data[column]])
        if len(total_list.ngram_fd.items()) > frequency:
            total_list.apply_freq_filter(frequency)
        scores = total_list.score_ngrams(bigram_collocations.pmi)
    elif n == 3:
        trigram_collocations = nltk.collocations.TrigramAssocMeasures()
        total_list = nltk.collocations.TrigramCollocationFinder.from_documents([text for text in data[column]])
        if len(total_list.ngram_fd.items()) > frequency:
            total_list.apply_freq_filter(frequency)
        scores = total_list.score_ngrams(trigram_collocations.pmi)
    # Sort by PMI and reset the index so that positional iteration below matches the sorted order.
    ngram_df = pd.DataFrame(scores, columns = ['ngram', 'pmi'])
    ngram_df = ngram_df.sort_values(by = 'pmi', axis = 0, ascending = False).reset_index(drop = True)
    print(ngram_df.head())
    to_keep = []
    if ngram_df.shape[0] == 0:
        return [], []
    for i in range(len(ngram_df)):
        ngram_val = ngram_df.loc[i, 'ngram']
        tagged_ngram = nltk.pos_tag(ngram_val)
        ngram_df.loc[i, 'pos'] = str(tagged_ngram)
        # Keep n-grams headed by a noun/adjective, without stopwords or contraction fragments ('n', 't').
        if n == 2:
            if (tagged_ngram[0][1] not in ['NN', 'JJ', 'VBG'] and tagged_ngram[1][1] not in ['NN']) or tagged_ngram[0][0] in stopwords_list or tagged_ngram[1][0] in stopwords_list or 'n' in ngram_val or 't' in ngram_val:
                to_keep.append(False)
            else:
                to_keep.append(True)
        elif n == 3:
            if (tagged_ngram[0][1] not in ['NN', 'JJ'] and tagged_ngram[2][1] not in ['NN']) or tagged_ngram[0][0] in stopwords_list or tagged_ngram[1][0] in stopwords_list or tagged_ngram[2][0] in stopwords_list or 'n' in ngram_val or 't' in ngram_val:
                to_keep.append(False)
            else:
                to_keep.append(True)
    # Combine the POS/stopword filter with the PMI threshold on the index-aligned frame.
    tagged_list = ngram_df[pd.Series(to_keep) & (ngram_df['pmi'] > min_pmi)].copy()
    final_list = tagged_list['ngram']
    tagged_list.reset_index(inplace = True)
    return final_list[:min(len(tagged_list), best_number)], tagged_list
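
# Illustrative usage of generate_ngrams (a tiny, hypothetical DataFrame of pre-tokenized turns;
# real inputs would be much larger):
#   df = pd.DataFrame({'tokens': [['climate', 'change', 'policy'], ['climate', 'change', 'debate']]})
#   ngrams, scored = generate_ngrams(df, 'tokens', n = 2, frequency = 1, min_pmi = 1)
#   # `ngrams` holds the highest-PMI bigrams that survive the POS/stopword/PMI filters
#   # (possibly none for an input this small); `scored` is the full filtered table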

def question_count(data, speaker_list = None, special_characters_rm = regular_expression):
    """
    Split a transcript into question and answer turns based on the speaker labels:
    turns by a 'host'/'cnn' speaker are treated as questions, everything else as responses.
    :param data: list of speaker turns
    :param speaker_list: list of speaker labels, one per turn
    :param special_characters_rm: compiled regex of characters to strip
    :return: questions, answers, question indices, answer indices, and a question -> answer dictionary
    """
    assert type(data) == type([]) or type(data) == type('Document'), "Data is not a list or string"
    assert type(speaker_list) == type([]) or speaker_list is None, "Speakers are not in a list"
    assert type(special_characters_rm) == re.Pattern, "Your special characters are not a regex Pattern object"
    assert len(data) == len(speaker_list), "Mismatch in data and speakers"
    status = ''
    is_valid_question = False
    qa_indices = []
    ans_indices = []
    qa_pairs = {}
    questions = []
    answers = []
    for turn in range(len(data)):
        current_speaker = speaker_list[turn]
        print("\nCurrent speaker:", current_speaker)
        if 'host' in current_speaker.lower() or 'cnn' in current_speaker.lower():
            status = 'questioner'
            questions.append(data[turn])
            qa_indices.append(turn)
        else:
            try:
                if status == 'questioner':
                    qa_pairs[questions[-1]] = data[turn]
                else:
                    # Append follow-up responses to the answer for the most recent question.
                    qa_pairs[questions[-1]] = qa_pairs[questions[-1]] + str(data[turn])
            except IndexError:
                # A response can occur before any question has been recorded.
                pass
            finally:
                status = 'response'
                answers.append(data[turn])
                ans_indices.append(turn)
        print("Status:", status)
        # Earlier sentence-level question detection, kept for reference:
        # sentences = sent_tokenize(data[turn])
        # for sent in sentences:
        #     if sent[-1] == '?' and status == 'questioner' and not is_valid_question:
        #         is_valid_question = True
        #     elif (status == 'questioner' and is_valid_question) or (status == 'questioner' and turn == 0):
        #         is_valid_question = False
        #     if is_valid_question and status == 'questioner':
        #         print("Valid question:", sent)
        #         questions.append(sent)
        #         qa_indices.append(turn)
        #         break
        # print(is_valid_question)
        # if status == 'response' and is_valid_question:
        #     print("Answer:", data[turn])
        #     answers.append(data[turn])
        #     ans_indices.append(turn)
        #     is_valid_question = False
    return questions, answers, qa_indices, ans_indices, qa_pairs
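
# Illustrative usage of question_count (made-up turns and speaker labels):
#   turns = ["What brings you here today?", "Mostly the weather.", "It has been lovely."]
#   speakers = ["JANE DOE, CNN HOST", "JOHN SMITH", "JOHN SMITH"]
#   questions, answers, q_idx, a_idx, qa_pairs = question_count(turns, speakers)
#   # questions -> the host turn only; answers -> both guest turns;
#   # qa_pairs maps the question to the guest's first reply with the follow-up appended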

def generate_report(data):
    """
    Build summary statistics for a single interview record.
    :param data: dictionary-like record with 'id', 'program', 'utt', 'speaker', and 'summary' fields
    :return: a dictionary of statistics and a question -> answer dictionary
    """
    channel = data['id'].split('-')[0]
    program = data['program']
    utterances = data['utt']
    speakers = data['speaker']
    summary = data['summary']
    num_turns = turn_count(utterances)
    num_words_summary = word_count(summary)
    num_words_dialog = word_count(utterances)
    num_speakers = speaker_count(speakers)
    # question_count returns five values; its indices and qa_pairs dictionary are not needed here.
    questions, answers, _, _, _ = question_count(utterances, speakers)
    num_qa_pairs = len(questions)
    question_lengths = [word_count(question) for question in questions]
    answer_lengths = [word_count(answer) for answer in answers]
    avg_q_length = 0
    avg_a_length = 0
    if num_qa_pairs > 0:
        avg_q_length = np.sum(np.array(question_lengths)) / num_qa_pairs
        avg_a_length = np.sum(np.array(answer_lengths)) / num_qa_pairs
    else:
        print("No answers or questions")
    qa_pairs = {}
    if len(questions) == len(answers):
        for question in range(len(questions)):
            qa_pairs[questions[question]] = answers[question]
    return {'channel': channel, 'program': program, 'turns': num_turns, 'words_summary': num_words_summary,
            'words_dialog': num_words_dialog, 'speakers': num_speakers,
            'avg_q': avg_q_length, 'avg_a': avg_a_length, 'num_q': num_qa_pairs}, qa_pairs
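
# Illustrative end-to-end call (a hypothetical record; the field names follow the keys accessed above):
#   record = {'id': 'CNN-123', 'program': 'Example Program',
#             'utt': ["What brings you here today?", "Mostly the weather."],
#             'speaker': ["JANE DOE, CNN HOST", "JOHN SMITH"],
#             'summary': "A short talk about the weather."}
#   stats, qa = generate_report(record)
#   # stats -> {'channel': 'CNN', 'program': 'Example Program', 'turns': 2, ...}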