parselog.py (forked from hortonworks/hive-testbench)
import time, csv, re, os, datetime

# Log location
LOG_FOLDER = "log_query/"
BASE_LOG_NAME = "logquery"
LOG_EXT = ".txt"

# Base params
os.environ["TZ"] = "US/Pacific"
time.tzset()  # make the TZ override take effect (POSIX only) before timestamping
time_id = datetime.datetime.now().strftime("%m.%d.%Y-%H.%M")
OUT_NAME = "llapio_summary" + time_id + ".csv"

def getCacheHitRatio(path):
    """ Returns the cache hit ratio """
    cacheHit, miss, total = 0, 0, 0
    with open(path, "r") as file:
        for line in file:
            if "CACHE_HIT_BYTES" in line:
                cacheHit += [int(item) for item in line.split() if item.isdigit()][0]
            elif "CACHE_MISS_BYTES" in line:
                miss += [int(item) for item in line.split() if item.isdigit()][0]
    total = cacheHit + miss
    if total != 0:
        return cacheHit / total * 100
    else:
        # query fail
        return 0.12345

def getMetadataHitRatio(path):
    """ Returns the metadata cache hit ratio (roughly, the cache retention rate) """
    metadataHit, miss, total = 0, 0, 0
    with open(path, "r") as file:
        for line in file:
            if "METADATA_CACHE_HIT" in line:
                metadataHit += [int(item) for item in line.split() if item.isdigit()][0]
            elif "METADATA_CACHE_MISS" in line:
                miss += [int(item) for item in line.split() if item.isdigit()][0]
    total = metadataHit + miss
    if total != 0:
        return metadataHit / total * 100
    else:
        # query fail
        return 0.12345

def write_csv(cacheHitRatios, metadataHitRatio):
    """
    Writes the parsed metrics to a CSV file.
    Extend by adding new columns and a matching map of parsed data.
    """
    queryNum = list(cacheHitRatios.keys())
    queryNum.sort(key=float)
    with open(OUT_NAME, "w", newline="") as output_csv:
        writer = csv.writer(output_csv)
        # header
        head = ["Query#", "Cache Hit %", "Metadata Hit %"]
        writer.writerow(head)
        # one row per query
        for i in queryNum:
            writer.writerow([float(i), cacheHitRatios[i], metadataHitRatio[i]])

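# Extension sketch (hypothetical, following the docstring note above): to record an
# additional metric, build another {query_num: value} dict in main(), pass it to
# write_csv() as an extra argument, append its label to `head`, and append its
# value to each row, e.g.
#     writer.writerow([float(i), cacheHitRatios[i], metadataHitRatio[i], newMetric[i]])
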
def main():
    querynum_to_cacheratio = {}
    querynum_to_metadatahitratio = {}
    for filename in os.listdir(LOG_FOLDER):
        if filename.startswith(BASE_LOG_NAME) and filename.endswith(LOG_EXT):
            query_runNum = re.findall(r"\d+\.\d+", filename)
            if len(query_runNum) == 1:
                query_num = query_runNum[0]
                filepath = os.path.join(LOG_FOLDER, filename)
                querynum_to_cacheratio[query_num] = getCacheHitRatio(filepath)
                querynum_to_metadatahitratio[query_num] = getMetadataHitRatio(filepath)
            else:
                raise Exception("Did not find query number in " + filename)
    write_csv(querynum_to_cacheratio, querynum_to_metadatahitratio)

if __name__ == "__main__":
    start = time.time()
    main()
    end = time.time()
    print("Log parsing finished in {0} secs".format(end - start))
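
# Usage sketch (an assumption, not verified against the rest of the repo): run
# "python parselog.py" from the directory containing LOG_FOLDER. The regex above
# expects log names of the form logquery<query>.<run>.txt (e.g.
# log_query/logquery55.1.txt); the per-query ratios are written to
# llapio_summary<MM.DD.YYYY-HH.MM>.csv.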