-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: commitMsgsAnalysis.py
126 lines (92 loc) · 3.43 KB
/
commitMsgsAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 1 06:04:31 2018
@author: gaurav
"""
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from collections import defaultdict
from nltk.probability import FreqDist
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.stem.lancaster import LancasterStemmer
import operator
import xlsxwriter
def tokenizeText(text):
    """Split *text* into a list of sentences using NLTK's sentence tokenizer."""
    sentences = sent_tokenize(text)
    return sentences
def tokenizeSentences(text, app):
    """Tokenize *text* into lowercase words, dropping stopwords and repo noise.

    Filters out English stopwords, punctuation, Git/GitHub boilerplate words
    (including the app's own name), and any token containing the substrings
    'hmrc' or 'ddcn'.

    :param text: raw commit-message text
    :param app: repository/app name to exclude from the token stream
    :return: list of surviving lowercase word tokens
    """
    word_sent = word_tokenize(text.lower())
    # Original list contained 'ddcnls' twice; set construction makes the
    # duplicate harmless either way, but it is removed here for clarity.
    noise = {'merge', app, 'pull', 'request', 'branch', 'master', 'ddcnls', '/'}
    _stopwords = set(stopwords.words('english')) | set(punctuation) | noise
    return [w for w in word_sent
            if w not in _stopwords and 'hmrc' not in w and 'ddcn' not in w]
def findSortedBigrams(words):
    """Return (bigram, count) pairs from *words*.

    NOTE(review): despite the name, the pairs are NOT sorted here — callers
    rank them via findMostUsed.
    """
    return BigramCollocationFinder.from_words(words).ngram_fd.items()
def findSortedTrigrams(words):
    """Return (trigram, count) pairs from *words*.

    NOTE(review): despite the name, the pairs are NOT sorted here — callers
    rank them via findMostUsed.
    """
    return TrigramCollocationFinder.from_words(words).ngram_fd.items()
def findStemWordsFreequency(words):
    """Return a FreqDist of Lancaster-stemmed forms of *words*."""
    stemmer = LancasterStemmer()
    # Feed stems lazily into the distribution instead of building a list first.
    return FreqDist(stemmer.stem(token) for token in words)
def findMostUsed(col):
    """Return the 5 highest-count (key, count) pairs from *col*.

    *col* is any mapping or iterable of (key, count) pairs (e.g. the
    ngram_fd.items() views produced by the bigram/trigram finders).
    heapq.nlargest — already imported at module level but previously unused —
    is equivalent to sorted(..., reverse=True)[:5] but O(n log 5) instead of
    O(n log n).
    """
    return nlargest(5, dict(col).items(), key=operator.itemgetter(1))
def findTotalMerges(msgs):
    """Count the messages in *msgs* containing 'merge pull request' (case-insensitive)."""
    return sum(1 for msg in msgs if 'merge pull request' in msg.lower())
def appStats(app):
    """Load an app's commit workbook and compute summary statistics.

    Reads commits/msgs/<app>.xlsx (Sheet1); on any read failure prints a
    notice and returns None (best-effort, matching the original behavior).

    :param app: app/repository name, used both as the filename stem and as
                a stopword during comment analysis
    :return: (app, per-month commit table, merge count, total commits,
              bigram summary, trigram summary), or None when no data exists
    """
    try:
        res = pd.read_excel('commits/msgs/' + app + '.xlsx', 'Sheet1')
    except Exception:
        # Was a bare `except:`; Exception keeps the best-effort skip while no
        # longer swallowing KeyboardInterrupt/SystemExit.
        print('no data for ' + app)
        return None
    table = res.groupby(['month']).agg({'name': len})
    totalMerges = findTotalMerges(res['comments'])
    totalCommits = len(res)
    (bi, tri) = appCommentsAnalysis(app, res)
    return (app, table, totalMerges, totalCommits, bi, tri)
def writeToExcel(app, table, merges, commits, bi, tri):
    """Write one app's stats to commits/<app>stats.xlsx.

    Bug fix: *bi* was accepted but never written, while the header labeled
    column 7 'Tri' — the bigram summary from appStats was silently dropped.
    Bigrams now land in column 7 ('Bi') and trigrams move to column 8 ('Tri').

    :param table: per-month commit counts (index = month number 1..4, which
                  doubles as the worksheet column index — a month >= 5 would
                  clobber the Merges column; TODO confirm only Jan-Apr occur)
    """
    workbook = xlsxwriter.Workbook('commits/' + app + 'stats.xlsx')
    worksheet = workbook.add_worksheet()
    headers = ['app', 'Jan', 'Feb', 'Mar', 'Apr', 'Merges', 'Commits', 'Bi', 'Tri']
    for col, label in enumerate(headers):
        worksheet.write(0, col, label)
    worksheet.write(1, 0, 'app')
    for rec in table.itertuples():
        month, count = rec
        worksheet.write(1, month, count)
    worksheet.write(1, 5, merges)
    worksheet.write(1, 6, commits)
    worksheet.write(1, 7, bi)
    worksheet.write(1, 8, tri)
    workbook.close()
def joinListItems(l):
    """Return the str() repr of the first elements of each pair in *l*.

    NOTE(review): ''.join over the characters of str(list) is an identity
    operation — this is exactly str([x[0] for x in l]).
    """
    firsts = [pair[0] for pair in l]
    return ''.join(str(firsts))
def appCommentsAnalysis(app, res):
    """Summarize the most-used bigrams and trigrams in an app's commit messages.

    :param app: app name, excluded as noise during tokenization
    :param res: the DataFrame loaded by appStats; its 'comments' column holds
                the commit messages (appStats reads the same column)
    :return: (bigram summary string, trigram summary string)
    """
    # ''.join replaces the quadratic `msgs += c` loop and the fragile 5-way
    # positional itertuples unpacking (index, d, n, c, m), where c was
    # presumably the comments column — named access makes that explicit.
    msgs = ''.join(res['comments'])
    word_sent = tokenizeSentences(msgs, app)
    bigram = findSortedBigrams(word_sent)
    trigram = findSortedTrigrams(word_sent)
    mostUsedBigram = findMostUsed(bigram)
    mostUsedTrigram = findMostUsed(trigram)
    return (joinListItems(mostUsedBigram), joinListItems(mostUsedTrigram))
# Scratch notes kept from exploration (previously bare string expressions,
# which — like these comments — have no runtime effect):
#   res.groupby(['date']).agg({'name': len})
#   res['date'].apply(lambda x: pd.to_datetime(x).month)