-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
107 lines (92 loc) · 3.62 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import re
import sys
from utils import write_status
from nltk.stem.porter import PorterStemmer
def preprocess_word(word):
# Remove punctuation
word = word.strip('\'"?!,.():;')
# Convert more than 2 letter repetitions to 2 letter
# funnnnny --> funny
word = re.sub(r'(.)\1+', r'\1\1', word)
# Remove - & '
word = re.sub(r'(-|\')', '', word)
return word
def is_valid_word(word):
# Check if word begins with an alphabet
return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)
def handle_emojis(tweet):
# Smile -- :), : ), :-), (:, ( :, (-:, :')
tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
# Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', tweet)
# Love -- <3, :*
tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
# Wink -- ;-), ;), ;-D, ;D, (;, (-;
tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
# Sad -- :-(, : (, :(, ):, )-:
tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
# Cry -- :,(, :'(, :"(
tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
return tweet
def preprocess_tweet(tweet):
processed_tweet = []
# Convert to lower case
tweet = tweet.lower()
# Replaces URLs with the word URL
tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
# Replace @handle with the word USER_MENTION
tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
# Replaces #hashtag with hashtag
tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
# Remove RT (retweet)
tweet = re.sub(r'\brt\b', '', tweet)
# Replace 2+ dots with space
tweet = re.sub(r'\.{2,}', ' ', tweet)
# Strip space, " and ' from tweet
tweet = tweet.strip(' "\'')
# Replace emojis with either EMO_POS or EMO_NEG
tweet = handle_emojis(tweet)
# Replace multiple spaces with a single space
tweet = re.sub(r'\s+', ' ', tweet)
words = tweet.split()
for word in words:
word = preprocess_word(word)
if is_valid_word(word):
if use_stemmer:
word = str(porter_stemmer.stem(word))
processed_tweet.append(word)
return ' '.join(processed_tweet)
def preprocess_csv(csv_file_name, processed_file_name, test_file=False):
save_to_file = open(processed_file_name, 'w')
with open(csv_file_name, 'r') as csv:
lines = csv.readlines()
total = len(lines)
for i, line in enumerate(lines):
tweet_id = line[:line.find(',')]
if not test_file:
line = line[1 + line.find(','):]
positive = int(line[:line.find(',')])
line = line[1 + line.find(','):]
tweet = line
processed_tweet = preprocess_tweet(tweet)
if not test_file:
save_to_file.write('%s,%d,%s\n' %
(tweet_id, positive, processed_tweet))
else:
save_to_file.write('%s,%s\n' %
(tweet_id, processed_tweet))
write_status(i + 1, total)
save_to_file.close()
print '\nSaved processed tweets to: %s' % processed_file_name
return processed_file_name
if __name__ == '__main__':
if len(sys.argv) != 2:
print 'Usage: python preprocess.py <raw-CSV>'
exit()
use_stemmer = False
csv_file_name = sys.argv[1]
processed_file_name = sys.argv[1][:-4] + '-processed.csv'
if use_stemmer:
porter_stemmer = PorterStemmer()
processed_file_name = sys.argv[1][:-4] + '-processed-stemmed.csv'
preprocess_csv(csv_file_name, processed_file_name, test_file=False)