-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbusiness_sim_3cixty.py
57 lines (44 loc) · 2.23 KB
/
business_sim_3cixty.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import json
import math
import re
from itertools import permutations

from mrjob.job import MRJob
from mrjob.protocol import JSONValueProtocol
from mrjob.step import MRStep
class BusinessSim(MRJob):
    """Score how similar businesses are by the words used in their reviews.

    Every input record is read as ``record['id']['value']`` (a
    '/'-separated identifier) and ``record['review']['value']`` (the review
    text). The job runs three steps:

    1. ``get_words`` / ``get_word_set`` build the distinct words per business.
    2. ``combine_review_words`` / ``compare_businesses`` gather all
       businesses under one key and compute the Jaccard coefficient of every
       ordered pair.
    3. ``make_business_list`` collects, for each business, its coefficients
       against every other business.
    """

    INPUT_PROTOCOL = JSONValueProtocol

    def get_words(self, _, record):
        """Yield ``(business_id, word)`` for each word of one review record.

        Args:
            _: Input key, unused.
            record: Mapping providing ``record['review']['value']`` and
                ``record['id']['value']``.

        Yields:
            ``(business_id, word)`` pairs, where ``business_id`` is the text
            after the last '/' of the record identifier.
        """
        review = record['review']['value']
        # Calling list() on text splits it into single characters, which
        # would make the later comparison a character-set comparison. Text
        # is therefore split on whitespace; a value that is already a
        # sequence of words is passed through unchanged.
        if hasattr(review, 'split'):
            words = review.split()
        else:
            words = list(review)

        # Only the last path segment of the identifier names the business.
        business_id = record['id']['value'].rsplit('/', 1)[-1]

        for word in words:
            yield business_id, word

    def get_word_set(self, business_id, words):
        """Collapse all words seen for one business into its distinct words.

        Args:
            business_id: Key produced by ``get_words``.
            words: Iterable of every word yielded for that business.

        Yields:
            ``(business_id, distinct_words)`` with ``distinct_words`` as a
            list (in no particular order).
        """
        yield business_id, list(set(words))

    def combine_review_words(self, business_id, review_words):
        """Re-key every business under one constant key.

        Emitting the same key for all records makes the following reducer
        receive every business in a single call, which it needs in order to
        pair them up.

        Yields:
            ``('foo', [business_id, review_words])``.
        """
        yield 'foo', [business_id, review_words]

    def compare_businesses(self, _, business_reviews):
        """Compute the Jaccard coefficient for every ordered business pair.

        Args:
            _: The constant key from ``combine_review_words``, unused.
            business_reviews: Iterable of ``[business_id, words]`` entries.

        Yields:
            ``([id1, words1], [id2, coefficient])`` for each ordered pair of
            distinct entries; ``coefficient`` is
            ``len(words1 & words2) / len(words1 | words2)``, or ``0.0`` when
            both word lists are empty.
        """
        # Build each word set once instead of once per pair.
        businesses = [
            (entry[0], entry[1], set(entry[1])) for entry in business_reviews
        ]

        for first, second in permutations(businesses, 2):
            first_id, first_words, first_set = first
            second_id, second_set = second[0], second[2]

            union_size = len(first_set | second_set)
            if union_size:
                shared = len(first_set & second_set)
                coefficient = float(shared) / union_size
            else:
                # Two empty word lists share nothing; avoid dividing by zero.
                coefficient = 0.0

            yield [first_id, first_words], [second_id, coefficient]

    def make_business_list(self, biz1, biz2s):
        """Gather every comparison result recorded for one business.

        Args:
            biz1: ``[business_id, words]`` key from ``compare_businesses``.
            biz2s: Iterable of ``[other_business_id, coefficient]`` values.

        Yields:
            ``(biz1, biz2s)`` with both materialised as lists.
        """
        yield list(biz1), list(biz2s)

    def steps(self):
        """Return the three steps of the job, in execution order."""
        # MRStep is mrjob's documented replacement for the deprecated
        # self.mr() helper.
        return [
            MRStep(mapper=self.get_words, reducer=self.get_word_set),
            MRStep(mapper=self.combine_review_words,
                   reducer=self.compare_businesses),
            MRStep(reducer=self.make_business_list),
        ]
# Start the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    BusinessSim.run()