-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspam_probability.py
executable file
·46 lines (28 loc) · 1.1 KB
/
spam_probability.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/python
import json
# Divisor applied to every token count. Per the formula below it should be
# the number of spam messages the counts were trained on.
# NOTE(review): the original inline comment said "tested with 502 emails",
# which disagrees with 1780 -- confirm which figure is correct.
TRAINED_SPAM_MESSAGES = 1780.0

# Value stored for any token whose ratio reaches or exceeds 1.0.
# NOTE(review): presumably kept just below 1.0 so that no single token counts
# as absolute proof of spam -- confirm the intent.
PROBABILITY_CEILING = 0.999

# Only tokens whose probability is strictly above this are printed.
REPORT_THRESHOLD = 0.2


def _load_counts(path):
    """Return the JSON object stored in the file at *path*.

    The file is opened in a ``with`` block so the handle is closed even when
    ``json.load`` raises on malformed input.
    """
    with open(path) as handle:
        return json.load(handle)


def _token_probabilities(counts, trained_on, ceiling=PROBABILITY_CEILING):
    """Return a dict mapping each token to ``count / trained_on``.

    counts     -- mapping of token -> numeric frequency.
    trained_on -- number the frequencies are divided by.
    ceiling    -- value substituted for any ratio that is >= 1.0.
    """
    divisor = float(trained_on)  # force true division on Python 2 as well
    probabilities = {}
    for token, count in counts.items():
        ratio = count / divisor
        if ratio >= 1.0:
            ratio = ceiling
        probabilities[token] = ratio
    return probabilities


x = _load_counts('spam_dict.txt')
y = _load_counts('more_spam_dict.txt')

# Merge both dictionaries; when a token occurs in both, the value from y
# overrides the value from x.
data = dict(x)
data.update(y)
print('The size of data dict is {0}'.format(len(data)))

# It's time to calculate the probability of each token, i.e. how much each
# token contributes to any spam email:
#   Ham probability  = Token frequency in ham messages  / Number of ham messages trained on
#   Spam probability = Token frequency in spam messages / Number of spam messages trained on
# Only the spam probability is computed in this script. If a probability
# reaches or exceeds 1.0 it is replaced by PROBABILITY_CEILING (0.999).

# Stores the contribution of each token to any spam email.
Spam_Token = _token_probabilities(data, TRAINED_SPAM_MESSAGES)

# Report the tokens that contribute most.
for token, probability in Spam_Token.items():
    if probability > REPORT_THRESHOLD:
        print(u'{0} : {1}'.format(token, probability))

# Persist every token probability as JSON text (despite the .db extension).
with open('spam_token_contribution.db', 'w') as outfile:
    json.dump(Spam_Token, outfile)