-
Notifications
You must be signed in to change notification settings - Fork 1
/
genarate_conditional_probability_clothing.py
76 lines (65 loc) · 2.71 KB
/
genarate_conditional_probability_clothing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import gzip
import json
from nltk.stem import PorterStemmer
porter = PorterStemmer()
import re
import time
import json
descriptions_list = []
descriptions_stemmed = []
file_read = {}
hash_map_stemmed_words = {}
with open('description_clothing_1.json','r') as file:
file_read = json.loads(file.read())
for each_line in file_read.values()[0]:
descriptions_list.extend(each_line.split('.')[:-1])
for each_description in descriptions_list:
sentence_stemmed = []
if each_description:
for each_word in each_description.split(' '):
try:
if len(each_word) > 0:
each_word = re.sub('[^A-Za-z0-9]+','',each_word)
sentence_stemmed.append(porter.stem(each_word))
if each_word:
if not hash_map_stemmed_words.get(porter.stem(each_word)):
hash_map_stemmed_words[porter.stem(each_word)] = [each_word]
else:
hash_map_stemmed_words[porter.stem(each_word)].append(each_word)
except Exception,e:
print e,each_word
descriptions_stemmed.append(sentence_stemmed)
bigram_hash = {}
token_frequency = {}
for each_description in descriptions_stemmed:
try:
for each_word in range(0,len(each_description)-1):
if not bigram_hash.get((each_description[each_word],each_description[each_word+1])) and each_description[each_word] and each_description[each_word+1]:
bigram_hash[(each_description[each_word],each_description[each_word+1])] = 1
elif bigram_hash.get((each_description[each_word],each_description[each_word+1])) and each_description[each_word] and each_description[each_word+1]:
bigram_hash[(each_description[each_word],each_description[each_word+1])] += 1
for each_word in range(0,len(each_description)):
if not token_frequency.get(each_description[each_word]) and each_description[each_word]:
token_frequency[each_description[each_word]] = 1
elif token_frequency.get(each_description[each_word]) and each_description[each_word]:
token_frequency[each_description[each_word]] += 1
except Exception,e:
print e
attribute_hash = {}
for attribute in bigram_hash.keys():
if not attribute_hash.get(attribute[1]):
attribute_hash[attribute[1]] = [attribute[0]]
else:
attribute_hash[attribute[1]].append(attribute[0])
for attribute in attribute_hash.keys():
list_tupple = ()
for value in attribute_hash[attribute]:
co_occurence = bigram_hash[(value,attribute)]
attribute_frequency = token_frequency[attribute]
list_tupple = list_tupple + (value,float(co_occurence)/float(attribute_frequency),)
attribute_hash[attribute] = list_tupple
file_open = open('attributes_stemmed_non_stemmed_hash_map.json','w')
json.dump(hash_map_stemmed_words,file_open)
file_open.close()
# with open('conditional_probability_distribuition_clothing.json','w') as file:
# json.dump(attribute_hash,file)