-
Notifications
You must be signed in to change notification settings - Fork 5
/
preprocess.py
95 lines (68 loc) · 2.58 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
# coding=utf-8
"""
Preprocess the Yelp dataset.
"""
import json
import pandas as pd
import re
import os
# Input and output directories, resolved against the current working directory.
# Both keep a trailing slash because file names are appended to them with `+`.
PATH = os.getcwd() + '/yelp_dataset/'
OUT_FPATH = os.getcwd() + '/yelp_dataset/preprocess/'

# makedirs(exist_ok=True) also creates a missing parent directory and does not
# fail when the directory already exists, so no separate existence check is needed.
os.makedirs(OUT_FPATH, exist_ok=True)

# The review file is large, so cap the number of reviews written to each output file.
MAX_REVIEW_NUM = 500000
def preprocess_business():
    """Load the Yelp business file and return only the restaurant rows.

    Reads ``yelp_academic_dataset_business.json`` (JSON lines) from ``PATH``,
    keeps the ``name``, ``categories`` and ``business_id`` columns, renames
    ``name`` to ``business_name`` and keeps the rows whose ``categories``
    value contains ``'Restaurants'``.

    Returns:
        A pandas DataFrame with the columns ``business_name``,
        ``categories`` and ``business_id``.
    """
    print('preprocessing business file.')
    raw = pd.read_json(PATH + 'yelp_academic_dataset_business.json', lines=True)

    # Keep only the columns that are needed and give `name` its final label.
    selected = raw[['name', 'categories', 'business_id']]
    selected = selected.rename(columns={'name': 'business_name'})

    # Rows without categories are dropped before the substring filter so the
    # boolean mask contains no missing values.
    with_categories = selected.dropna(subset=['categories'])
    is_restaurant = with_categories['categories'].str.contains('Restaurants')
    restaurants = with_categories[is_restaurant]

    print('business len:', len(restaurants))
    print('business preprocess done!')
    return restaurants
def preprocess_review():
    """Load the Yelp review file and return the reviews with cleaned text.

    Reads ``yelp_academic_dataset_review.json`` (JSON lines) from ``PATH``,
    keeps the ``business_id``, ``stars`` and ``text`` columns, and removes a
    fixed set of punctuation/symbol characters plus every newline from
    ``text``.

    Returns:
        A pandas DataFrame with the columns ``business_id``, ``stars`` and
        ``text``.
    """
    print('preprocessing review file. It can take more than a few minutes.')
    review_path = PATH + 'yelp_academic_dataset_review.json'
    df = pd.read_json(review_path, lines=True)

    # only keep necessary columns
    keeps = ['business_id', 'stars', 'text']
    df = df[keeps]

    # Characters stripped from every review. Written as a raw string so the
    # regex escapes are not parsed as (invalid) Python string escapes, and
    # compiled once instead of being looked up for every row. The newline is
    # part of the same class, so a single pass removes symbols and line breaks.
    # The set of removed characters is the same as before.
    noise = re.compile(r'[-=+#/?:^@*"※~&%ㆍ』‘|()\[\]<>`…》\n]')

    def clean_text(text):
        """Return *text* with the noise characters and newlines removed."""
        return noise.sub('', text)

    df['text'] = df['text'].apply(clean_text)

    print('review len:', len(df))
    print('review preprocess done!')
    return df
def preprocess(business_df, review_df):
    """Split restaurant reviews by star rating and write them to text files.

    The two frames are joined on ``business_id``; the ``text`` of the joined
    rows is then divided into three groups and each group, truncated to at
    most ``MAX_REVIEW_NUM`` rows, is written under ``OUT_FPATH``:

    * ``positive.txt`` - stars greater than 3
    * ``neutral.txt``  - stars equal to 3
    * ``negative.txt`` - stars less than 3

    Args:
        business_df: DataFrame with a ``business_id`` column.
        review_df: DataFrame with ``business_id``, ``stars`` and ``text``
            columns.
    """
    print('preprocessing')

    # merge review and restaurant
    merged = business_df.merge(review_df, on='business_id')
    stars = merged['stars']

    # (output file name, console label, truncated texts) for each sentiment.
    groups = [
        ('positive.txt', 'pos text file:', merged.loc[stars > 3, 'text'][:MAX_REVIEW_NUM]),
        ('neutral.txt', 'neu text file:', merged.loc[stars == 3, 'text'][:MAX_REVIEW_NUM]),
        ('negative.txt', 'neg text file:', merged.loc[stars < 3, 'text'][:MAX_REVIEW_NUM]),
    ]

    # write to txt
    # NOTE(review): with sep=' ' the CSV writer presumably quotes any text that
    # contains a space - confirm that quoted lines are what consumers expect.
    for filename, _label, texts in groups:
        texts.to_csv(OUT_FPATH + filename, encoding='utf-8', header=None, index=None, sep=' ')

    # Report the sizes only after all three files have been written.
    for _filename, label, texts in groups:
        print(label, len(texts))

    print('preprocess done!')
def main():
    """Run the pipeline: business file first, then review file, then export."""
    # Arguments are evaluated left to right, so the business file is still
    # processed before the (much larger) review file.
    preprocess(preprocess_business(), preprocess_review())


if __name__ == "__main__":
    main()