#!/usr/bin/env python3
import json
import os
import re
import sys
from email.parser import BytesParser

from bs4 import BeautifulSoup

parser = BytesParser()
word_count = {}
word_re = re.compile('[a-z]+', re.IGNORECASE)  # alphabetic tokens only
unreadable = 0  # emails whose payload could not be read

directory = sys.argv[1]
path = './' + directory + '/'
print(path)
for name in os.listdir(path):
    with open(path + name, 'rb') as fp:
        msg = parser.parse(fp)
    text = msg.get_payload()  # str for simple messages, list for multipart
    try:
        text = text.lower()
    except AttributeError:
        # Multipart message: ignore it entirely if any part is base64 encoded.
        if any(part['Content-Transfer-Encoding'] == 'base64'
               for part in msg.get_payload()):
            print('base64:', name)
            continue
        text = ' '
        for part in msg.get_payload():
            try:
                text = text + part.get_payload()
            except TypeError:  # nested multipart: payload is a list, not a str
                unreadable += 1
                print(name, ':', unreadable)  # how many emails cannot be read
                continue
        text = text.lower()
    try:
        # Strip HTML tags, then keep only the alphabetic tokens.
        stripped = ' '.join(BeautifulSoup(text, 'html.parser').get_text().split())
        tokens = word_re.findall(stripped)
    except TypeError:
        print('Could not tokenize:', name)
        continue
    for word in tokens:
        word_count[word] = word_count.get(word, 0) + 1  # count occurrences
# Sanity check: show the most frequent words.
for word, count in word_count.items():
    if count > 50:
        print(word, count)
# Dump the word counts for the matching corpus to a JSON file.
if directory == 'spam':
    with open('spam_dict.txt', 'w') as outfile:
        json.dump(word_count, outfile)
elif directory == 'easy_ham':
    with open('ham_dict.txt', 'w') as outfile:
        json.dump(word_count, outfile)
elif directory == 'spam_2':
    with open('more_spam_dict.txt', 'w') as outfile:
        json.dump(word_count, outfile)
# TODO: strip any remaining HTML tags and make sure every word is lowercased.
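
# A minimal usage sketch, assuming the script sits next to the corpus
# directories (e.g. the SpamAssassin public-corpus folders; the directory
# names come straight from the branches above):
#
#     python3 body_extractor.py spam        # writes spam_dict.txt
#     python3 body_extractor.py easy_ham    # writes ham_dict.txt
#     python3 body_extractor.py spam_2      # writes more_spam_dict.txt
#
# The dumps can then be read back with json.load, e.g.:
#
#     with open('spam_dict.txt') as f:
#         spam_counts = json.load(f)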