-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
63 lines (50 loc) · 1.94 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
# Matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")

# Lazily filled cache so the stop-word corpus is read at most once per process
# instead of once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a set, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_list(raw_review, remove_stopwords):
    """Convert a raw review to a list of lowercase words.

    Args:
        raw_review: review text, possibly containing HTML markup.
        remove_stopwords: when truthy, drop every word found in the NLTK
            English stop-word list.

    Returns:
        The words of the review, lowercased, in their original order.
        Every non-letter character acts as a separator, so "don't"
        yields "don" and "t".
    """
    # Remove HTML
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Replace non-letters with spaces, then split on whitespace
    words = _NON_LETTERS.sub(" ", review_text).lower().split()
    if remove_stopwords:
        # Get rid of stop words (set membership keeps this O(1) per word)
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words
def build_dictionary(review_list):
    """Build the word <-> ID lookup tables for a collection of reviews.

    Each distinct word receives the next free integer ID (starting at 0)
    the first time it is encountered while scanning the reviews in order.

    Returns a pair ``(dictionary, reverse_dictionary)``: the first maps
    a word to its ID, the second maps an ID back to its word.
    """
    word_to_id = {}
    for tokens in review_list:
        for token in tokens:
            # setdefault leaves already-known words untouched, so an ID is
            # only ever handed out on a word's first appearance.
            word_to_id.setdefault(token, len(word_to_id))
    id_to_word = {idx: token for token, idx in word_to_id.items()}
    return word_to_id, id_to_word
def build_data(dictionary, review_list):
    """Translate every review from a list of words into a list of word IDs.

    ``dictionary`` maps a word to its ID. The result holds one list of IDs
    per review, in the same order as ``review_list``. A word missing from
    ``dictionary`` raises ``KeyError``.
    """
    return [[dictionary[token] for token in tokens] for tokens in review_list]
def get_data(remove_stopwords=False, path="data/labeledTrainData.tsv"):
    """Read the labelled reviews from disk and preprocess them.

    Args:
        remove_stopwords: forwarded to ``review_to_list``; when truthy,
            English stop words are dropped from every review.
        path: tab-separated file with a header row that provides a
            "review" column and a "sentiment" column. Defaults to the
            location the function originally had hard-coded.

    Returns:
        A 4-tuple ``(data, sentiment, dictionary, reverse_dictionary)``:
        the reviews as lists of word IDs, the "sentiment" column of the
        file, the word -> ID mapping and the ID -> word mapping.
    """
    # quoting=3 is csv.QUOTE_NONE: quote characters inside the review text
    # are kept as ordinary characters instead of delimiting fields.
    train = pd.read_csv(path, header=0, delimiter="\t", quoting=3)
    # Iterate the column directly rather than indexing it by position.
    reviews = [review_to_list(raw, remove_stopwords) for raw in train["review"]]
    dictionary, reverse_dictionary = build_dictionary(reviews)
    data = build_data(dictionary, reviews)
    return data, train["sentiment"], dictionary, reverse_dictionary