-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparsenew.tar
112 lines (89 loc) · 10 KB
/
parsenew.tar
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# ===== parselogs.py =====
from parse_tools import *
import sys
from log_util import *
# Tally how many log entries were seen for each (hashIP, country) pair and
# print the total followed by every pair with its count.
# key: (ip, country), value: frequency
ip_country_dict = {}

# The plan is to use DirWalker to yield each log line of each file.
# NOTE(review): LogWalker is not defined by the log_util module; it only
# resolves if parse_tools exports it. log_util.DirWalker takes the same
# (root, syntax) arguments -- confirm which one is intended.
w = LogWalker(sys.argv[1], 'db33_34_swdf')
for log in w:
    print(log)
    (haship, country) = log['hashIP'], log['country']
    key = (haship, country)
    # dict.get replaces the has_key()/else pair (has_key is gone in Python 3).
    ip_country_dict[key] = ip_country_dict.get(key, 0) + 1

# %-formatting keeps the output identical under both the Python 2 print
# statement and the Python 3 print function.
print('total: %s' % sum(ip_country_dict.values()))
for k, v in ip_country_dict.items():
    print('%s %s' % (k, v))
# ===== log_util.py =====
import os
from parse_tools import *
# Parse-syntax identifiers passed as the ``parse_syntax`` argument of Log.
# Log.load parses this syntax with getLogLineBNF_DBpedia33().
DB33_34_SWDF = 'db33_34_swdf'
# Log.load parses this syntax with getLogLineBNF_DBpedia36().
DB36 = 'db36'
# NOTE(review): Log.load has no parser branch for this value.
BIORDF = 'biordf'
class Log(object):
    """One raw log line, parsed with the grammar chosen by ``parse_syntax``.

    Parsing happens eagerly in the constructor; the result is stored in
    ``self.ret`` and returned by :meth:`to_dict`.
    """

    def __init__(self, raw_text, folder, parse_syntax):
        """Store the arguments and parse ``raw_text`` immediately.

        raw_text     -- the log line to parse
        folder       -- name of the folder the line came from (only stored)
        parse_syntax -- DB33_34_SWDF or DB36

        Raises ValueError for any other ``parse_syntax``. Whatever the
        grammar's parseString() raises for a malformed line propagates.
        """
        self.folder = folder
        self.raw_text = raw_text
        self.parse_syntax = parse_syntax
        self.load()

    def to_dict(self):
        """Return the parse result produced by :meth:`load`.

        NOTE(review): this is whatever parseString() returned, presumably
        a mapping-like object rather than a plain dict -- confirm.
        """
        return self.ret

    def load(self):
        """Parse ``self.raw_text`` into ``self.ret``.

        Raises ValueError when ``self.parse_syntax`` has no parser.
        """
        if self.parse_syntax == DB33_34_SWDF:
            grammar = getLogLineBNF_DBpedia33()
        elif self.parse_syntax == DB36:
            grammar = getLogLineBNF_DBpedia36()
        else:
            # An unknown syntax used to fall through silently, leaving
            # self.ret unset so to_dict() failed later with an opaque
            # AttributeError. Fail here with an explicit message instead.
            raise ValueError(
                'unsupported parse syntax: %r' % (self.parse_syntax,))
        self.ret = grammar.parseString(self.raw_text)
class DirWalker(object):
    """Iterate over the parsed log lines of every file below ``root``.

    Each iteration walks the directory tree afresh. ``parsed`` counts the
    lines yielded so far and ``skipped`` counts the files abandoned because
    of an error; both accumulate across iterations.
    """

    def __init__(self, root, syntax):
        """Remember the tree to walk and the syntax used to parse its lines.

        root   -- top directory handed to os.walk()
        syntax -- parse-syntax identifier forwarded to Log
        """
        self.root = root
        self.curdir = None  # not used anywhere else in this class
        self.skipped = 0
        self.parsed = 0
        self.syntax = syntax

    def __iter__(self):
        """Yield ``Log(line, folder, syntax).to_dict()`` for every line.

        ``folder`` is the base name of the directory containing the file.
        Implemented as a generator, so the returned iterator follows the
        iterator protocol on both Python 2 and Python 3.
        """
        for dirpath, _dirnames, filenames in os.walk(self.root):
            folder = os.path.basename(dirpath)
            for fname in filenames:
                print('now process fname: %s' % fname)
                try:
                    with open(os.path.join(dirpath, fname)) as f:
                        for line in f:
                            yield Log(line, folder, self.syntax).to_dict()
                            # Counted once the consumer asks for the next
                            # item, i.e. after the line was handed out.
                            self.parsed += 1
                except Exception:
                    # Best effort: any open or parse failure abandons the
                    # rest of this file and is counted once; the walk then
                    # continues with the next file.
                    self.skipped += 1
if __name__ == '__main__':
    import sys

    # Count how many parsed log lines carry each (hashIP, country) pair in
    # the directory tree given as the first command-line argument, then
    # print the total followed by every pair with its count.
    w = DirWalker(sys.argv[1], DB33_34_SWDF)
    ip_country_dict = {}
    for fields in w:
        (haship, country) = fields['hashIP'], fields['country']
        key = (haship, country)
        # dict.get replaces the has_key()/else pair (has_key is gone in
        # Python 3).
        ip_country_dict[key] = ip_country_dict.get(key, 0) + 1

    # %-formatting keeps the output identical under both the Python 2
    # print statement and the Python 3 print function.
    print('total: %s' % sum(ip_country_dict.values()))
    for k, v in ip_country_dict.items():
        print('%s %s' % (k, v))