ParseLogs.py
from urllib.request import urlopen
import os
import re
import datetime
import json

# Constant Vars
url = "https://s3.amazonaws.com/tcmg476/http_access_log"
fileName = "http.log"
monthName = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'} # Maps month num (key) to name (value)

# Downloads http log to "http.log"
def getDataFile():
    with open(fileName, 'wb') as logFile:  # creates a new http.log file
        with urlopen(url) as stream:  # connect to server
            fileSize = stream.length
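            # Note: stream.length is http.client.HTTPResponse.length, which comes from the
            # Content-Length header; it can be None if the server omits that header, which
            # would break the progress-percentage math below.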
            # https://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python
            print("Downloading \"%s\" (%s KB)..." % (fileName, fileSize / 1000))
            currentFileSize = 0
            blockSize = 8192
            while True:  # loop through download (blockSize at a time), and write bytes to file
                buffer = stream.read(blockSize)
                if not buffer:  # if at end of file
                    break
                currentFileSize += len(buffer)  # track how many bytes downloaded so far
                logFile.write(buffer)
                status = r"%10d [%3.2f%%]" % (currentFileSize, currentFileSize * 100. / fileSize)  # displays percentage downloaded
                status = status + chr(8) * (len(status) + 1)  # append backspaces so the next status overwrites this one
                print(status, end="")  # prints without appended "\n"
            print("", end="\n")  # reset print appended char

# Sorts logs by month, and then by day, into the "data" dictionary.
def parseLogs(data):
    with open(fileName, 'r') as logFile:  # opens http.log file
        print("Parsing Data File...")
        monthNum = {v: k for k, v in monthName.items()}  # maps month name (key) to num (value), generated by inverting the key/val pairs of monthName
        currline = 0
        badParses = []  # list of all failed parses
        regex = re.compile(r'.*\[(.*?):.*\] \".* (.*) .*\" (\d{3})')  # raw string so \[ and \d reach the regex engine unescaped
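        # Example of the (assumed NCSA Common Log Format) lines this regex is meant to match:
        #   host - - [24/Oct/1994:13:41:41 -0600] "GET index.html HTTP/1.0" 200 150
        # group 1 -> "24/Oct/1994" (date), group 2 -> "index.html" (requested file), group 3 -> "200" (status code)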
        for line in logFile:  # iterate through entire log file
            currline += 1
            splitData = regex.split(line)
            if len(splitData) == 5:  # If regex worked:
                dateSplit = splitData[1].split('/')  # splits up day/month/year string
                date = datetime.date(int(dateSplit[2]), monthNum[dateSplit[1]], int(dateSplit[0]))  # create date object (year, month, day)
                logData = {'date': date, 'name': splitData[2], 'code': int(splitData[3])}  # store each log as a dict #TODO: Add key for all data
                if date.day in data[date.month]:  # if logs list has already been created for that day
                    data[date.month][date.day].append(logData)  # append dictionary containing log data
                else:
                    data[date.month][date.day] = [logData]  # otherwise add to month dictionary, key = day, value = logData
            else:  # If regex didn't work:
                badParses.append(splitData)  # add to list of failures
    print("\nTotal Requests: " + str(currline))
    print("\n(" + str(len(badParses)) + " lines couldn't be parsed)")  # TODO: save bad parses to file

# Returns the total number of log events recorded in a month dictionary
def countEvents(month):
    total = 0
    for dayNum, logs in month.items():
        total += len(logs)
    return total

def main():
    data = {x: {} for x in range(1, 13)}  # generates a dictionary containing 12 empty dictionaries (one for each month of data),
                                          # key = monthNum, value = dictionary of events on each day
    if not os.path.exists(fileName):  # check if file exists before re-downloading
        print("No cached " + fileName + " found.\nDownloading from: " + url)
        getDataFile()  # Saves file as http.log
    else:
        print("Using cached " + fileName + " file.")
    parseLogs(data)  # parses data file, and sorts by month and day
    print("\nEvents Per Month/Day/Week:")
    successCode = 0
    errorCode = 0
    elsewhereCode = 0
    fileNames = {}  # tracks how many times each file name was referenced
    weeklyLogs = {}  # tracks how many requests fall in each ISO week number
    # Main loop - goes through data dictionary, keeping track of stats
    for monthNum, month in data.items():  # for each dictionary in data
        print("\n" + monthName[monthNum] + ": [" + str(countEvents(month)) + " total events]")  # prints name of month & how many events occurred
        for dayNum, logs in month.items():  # iterate through each day of logs
            print("\t" + str(dayNum) + " - " + str(len(logs)) + " events")
            for log in logs:  # iterate through each log dictionary contained in the logs list
                # track http codes
                logCode = log['code']
                if logCode <= 299:
                    successCode += 1
                elif 300 <= logCode <= 399:
                    elsewhereCode += 1
                else:  # logCode >= 400
                    errorCode += 1
                # track file names
                if log["name"] in fileNames:
                    fileNames[log["name"]] += 1
                else:
                    fileNames[log["name"]] = 1
                # track logs per week
                if log["date"].isocalendar()[1] in weeklyLogs:
                    weeklyLogs[log["date"].isocalendar()[1]] += 1
                else:
                    weeklyLogs[log["date"].isocalendar()[1]] = 1
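                # Note: weeklyLogs is keyed by ISO week number alone, so if the log spans
                # more than one calendar year, requests from the same week number in
                # different years are merged into a single count.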
    sorted_weeklyLogs = sorted(weeklyLogs.items(), key=lambda x: x[0])  # sort weeks by week number
    print("\nRequests per week: ")
    for weekNum, count in sorted_weeklyLogs:
        print("\t" + str(weekNum) + " - " + str(count) + " events")
    total_codes = float(successCode + errorCode + elsewhereCode)
    print("\nPercentage failure (4xx/5xx): {0:.4g} %".format((errorCode / total_codes) * 100.0))
    print("Percentage redirected (3xx): {0:.4g} %".format((elsewhereCode / total_codes) * 100.0))
    sorted_fileNames = sorted(fileNames.items(), key=lambda x: x[1])  # Sort fileNames dict by file count
    print("\nMost requested file: " + sorted_fileNames[-1][0] + " (accessed " + str(sorted_fileNames[-1][1]) + " times)")
    print("Least requested file: " + sorted_fileNames[0][0] + " (accessed " + str(sorted_fileNames[0][1]) + " time)")
    print("\nCreating .json files...")
    for monthNum in range(1, 13):
        with open(monthName[monthNum] + ".json", 'w') as outfile:
            json.dump(data[monthNum], outfile, default=str)  # default=str serializes the datetime.date objects as strings

if __name__ == "__main__":
    main()
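
# Example usage (the script takes no command-line arguments):
#   $ python ParseLogs.py
# On the first run it downloads the access log into ./http.log; later runs reuse the
# cached file. It then prints per-month, per-day, and per-week request counts plus
# status-code and file-name statistics, and writes one JSON file per month
# (Jan.json ... Dec.json) into the current directory.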