-
Notifications
You must be signed in to change notification settings - Fork 2
/
git_extract.rb
333 lines (297 loc) · 8.44 KB
/
git_extract.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
#! /usr/bin/ruby
# add the directory of the script file to the load path so
# ruby will find trollop
$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
require 'set'
require 'rubygems'
require 'sequel'
require 'open4'
require 'time'
require 'trollop'
class String
  # Turns a method name held in a String into a Proc.
  #
  # The returned proc calls +send+ on this very string, using the string
  # itself as the method name and forwarding every argument it is given.
  # Because +send+ ignores visibility, the name may also refer to a private
  # method reachable from a String (for example a method defined at the
  # top level of a script).
  def to_proc
    Proc.new { |*arguments| send(self, *arguments) }
  end
end
class Symbol
  # Turns a method name into a Proc: the first argument passed to the proc
  # is the receiver, any remaining arguments are forwarded to the method.
  #
  # The proc deliberately takes a single splat parameter. A proc declared
  # as |obj, *args| auto-splats a lone Array argument, so
  # [[1, 2]].map(&:first) would end up calling 1.first(2) instead of
  # [1, 2].first.
  def to_proc
    proc do |*args|
      receiver = args.shift
      receiver.send(self, *args)
    end
  end
end
# Minimal stopwatch: remembers a reference time and reports how long it has
# been since that reference was last reset.
class Timer
  # Time of construction or of the most recent call to #delta.
  attr_reader :last

  def initialize
    @last = Time.now
  end

  # Returns the seconds elapsed since the reference time and moves the
  # reference time to now.
  def delta
    previous, @last = @last, Time.now
    @last - previous
  end

  # Prints the elapsed seconds (see #delta) to standard output.
  def log
    puts delta
  end
end
# Script-wide timer; the functions below call $timer.log / $timer.delta to
# print how long each stage took.
$timer = Timer.new
# Makes sure table +name+ exists in +db+.
#
# name  - table name
# db    - database handle responding to table_exists?, drop_table and
#         create_table
# reset - when true, an existing table is dropped first so it is recreated
#         empty
# block - schema definition handed through to db.create_table
def setup_table(name, db, reset, &block)
  db.drop_table(name) if db.table_exists?(name) && reset
  db.create_table(name, &block) unless db.table_exists?(name)
end
# Creates every table this script writes to, via setup_table.
#
# db    - database handle passed through to setup_table
# reset - passed through to setup_table; true recreates the tables empty
#
# NOTE(review): each primary_key call below names columns that are already
# declared as String columns in the same table. How that is interpreted
# depends on the Sequel version in use -- confirm the generated schema.
def setup_tables(db, reset)
  # Which repository name each commit was seen in.
  setup_table(:git_repo, db, reset) do
    String :commit
    String :repo
    primary_key :commit, :repo
    index :commit
  end

  # One row of metadata per commit.
  setup_table(:git_commit, db, reset) do
    String :commit
    String :tree
    String :author
    DateTime :author_dt, :default => '1980-1-1'
    String :author_id
    String :committer
    DateTime :committer_dt, :default => '1980-1-1'
    String :committer_id
    String :subject, :default => ''
    Integer :num_children, :default => 0
    Integer :num_parents, :default => 0
    String :log, :default => ''
    primary_key :commit
  end

  # Parent/child edges between commits.
  setup_table(:git_dag, db, reset) do
    String :child, :null => false
    String :parent, :null => false
    index :parent
    index :child
  end

  # Per-commit, per-path counts of added and removed lines.
  setup_table(:git_revision, db, reset) do
    String :commit, :null => false
    Integer :add
    Integer :remove
    String :path, :null => false
    primary_key :commit, :path
    index :commit
    index :path
  end

  # Ref and tag names attached to a commit.
  setup_table(:git_refs_tags, db, reset) do
    String :commit, :null => false
    String :path, :null => false
    primary_key :commit, :path
  end

  setup_table(:git_chain, db, reset) do
    String :commit, :null => false
    String :name_addr
    Integer :indiv_id
    String :type # s = Signed, a = Acked, t = Tested, r = Reviewed, c = Cc'ed
    primary_key :commit
    index :commit
  end
end
# git log invocation whose output parse_log consumes. Each commit is emitted
# as a marker-delimited record, one field per line, in this order:
#   start marker, commit hash (%H), tree hash (%T), author (%an <%ae>),
#   author date (%ai), committer (%cn <%ce>), committer date (%ci),
#   parent hashes (%P), ref names (%d), subject (%s), body (%b), end marker.
# --numstat appends the per-file added/removed line counts after the end marker.
$git_log_cmd = "git log --full-history --all --date=iso --numstat -M -C --pretty=format:\"" +
"__START_GIT_COMMIT_LOG_MSG__%n%H%n%T%n%an <%ae>%n%ai%n%cn <%ce>" +
"%n%ci%n%P%n%d%n%s%n%b%n__END_GIT_COMMIT_LOG_MSG__\""
# Runs $git_log_cmd inside the repository at +repo_path+ and returns its
# standard output as an Array of whitespace-stripped lines.
#
# If the directory cannot be entered, the command cannot be started, or git
# exits with a non-zero status, the problem is printed and the script exits
# with status 1.
def get_git_log_lines(repo_path)
  puts "Getting git log text"
  # Printed for the operator only. The command is actually run from inside
  # repo_path via Dir.chdir, so the path never passes through the shell and
  # paths containing spaces or shell metacharacters work.
  puts "cd #{repo_path} && #{$git_log_cmd}"

  error_txt = ""
  history = []
  begin
    status = Dir.chdir(repo_path) do
      Open4::popen4($git_log_cmd) do |_pid, _stdin, stdout, stderr|
        # Drain stderr concurrently: reading stdout to EOF first can
        # deadlock if git fills the stderr pipe in the meantime.
        stderr_reader = Thread.new { stderr.read }
        history = stdout.readlines
        error_txt = stderr_reader.value
      end
    end
  rescue SystemCallError => e
    puts "unable to run git log in #{repo_path}: #{e.message}"
    exit(1)
  end

  # if there was an error message, then print it out and die
  if status.exitstatus != 0
    puts "exit status is : #{status.exitstatus}"
    puts error_txt
    exit(1)
  end

  puts "Parsing git log"
  history.map!(&:strip)
end
# Reads a previously saved git log from +file+ and returns its lines with
# surrounding whitespace stripped.
#
# File.readlines is used instead of Kernel#open so the file handle is closed
# once the read finishes and a name starting with "|" is treated as a file
# name rather than run as a command.
def get_file_log_lines(file)
  File.readlines(file).map(&:strip)
end
# Inserts +data+ (an Array of row Arrays whose values line up with the
# column names in +schema+) into +table+ of +db+.
#
# For MySQL connections the rows are sent in batches of at most 50; every
# other database receives all rows in a single multi_insert call.
# each_slice yields only non-empty batches, so no empty trailing insert is
# issued when the row count is a multiple of 50 (or zero).
#
# TODO: callers in this file use Sequel::Dataset#sliced_multi_insert; this
# standalone variant could be folded into it.
def sliced_multi_insert(db, table, schema, data)
  if db.uri =~ /^mysql/
    data.each_slice(50) { |batch| db[table].multi_insert(schema, batch) }
  else
    db[table].multi_insert(schema, data)
  end
end
class Sequel::Dataset
  public

  # Inserts +data+ (an Array of row Arrays whose values line up with the
  # column names in +schema+) into this dataset.
  #
  # For MySQL connections the rows are passed to multi_insert together with
  # the column list, in batches of at most 50. each_slice yields only
  # non-empty batches, so no empty trailing insert is issued when the row
  # count is a multiple of 50 (or zero).
  #
  # For every other database each row is converted to a column => value
  # Hash and all rows go to multi_insert in one call. Row values beyond the
  # schema length are ignored; missing ones become nil.
  def sliced_multi_insert(schema, data)
    if @db.uri =~ /^mysql/
      data.each_slice(50) { |batch| multi_insert(schema, batch) }
    else
      row_hashes = data.map { |row| Hash[schema.zip(row)] }
      multi_insert(row_hashes)
    end
  end

  # NOTE(review): sliced_multi_insert above is an instance method, so this
  # call does not affect it. It appears to resolve only because a top-level
  # method with the same name is defined in this file -- confirm whether
  # this line is needed at all. Kept so the class's interface is unchanged.
  public_class_method :sliced_multi_insert
end
# Parses git log output (in the format produced by $git_log_cmd) and stores
# the result in the database.
#
# repo_name - name recorded in git_repo for every commit seen
# history   - Array of stripped log lines; it is emptied once parsing is
#             done, and ref lines are modified in place while parsing
# db        - database handle; db[:table] must respond to each (for
#             git_commit) and sliced_multi_insert
#
# Repo and ref/tag rows are recorded for every commit in the log. Commit,
# DAG and revision rows are recorded only for commits whose hash is not
# already present in git_commit.
def parse_log(repo_name, history, db)
  print "**** parsing history ****\n"
  i = 0
  parsed_hashes = 0

  # Hashes of all commits that are already in the db. Only the hash column
  # is kept: storing the whole row would make the include? check below
  # compare a String against Hashes and never match.
  commits = Set.new
  db[:git_commit].each { |row| commits.add(row[:commit]) }

  commit_data = []
  repo_data = []
  refs_tags_data = []
  dag_data = []
  revision_data = []

  loop do
    # advance to the start marker of the next commit record
    i += 1 while i < history.length && history[i] !~ /^__START_GIT_COMMIT_LOG_MSG__/
    break if i >= history.length
    i += 1
    # s indexes the commit hash line; the other header fields sit at fixed
    # offsets from it
    s = i
    commit_id = history[i]

    # we ALWAYS have to insert the repo and the refs
    repo_data << [repo_name, commit_id]
    i += 1

    refs_line = history[s + 7]
    unless refs_line.empty?
      # strip the surrounding parentheses of git's %d output
      refs_line.gsub!(/^\s*\(/, "")
      refs_line.gsub!(/\)\s*$/, "")
      refs_line.split(", ").each do |ref|
        refs_tags_data << [commit_id, ref]
      end
    end

    parsed_hashes += 1
    if parsed_hashes % 1000 == 0
      puts "parsed #{parsed_hashes} commits\n"
      puts "took #{$timer.delta} seconds\n"
    end

    # the sha may already be in the database so check that
    next if commits.include?(commit_id)

    # subject and body lines, each followed by a literal backslash + "n"
    log = ""
    i = s + 8
    while i < history.length && history[i] !~ /^__END_GIT_COMMIT_LOG_MSG__/
      log << history[i] + "\\n"
      i += 1
    end

    tree = history[s + 1]
    author = history[s + 2]
    author_dt = history[s + 3]
    committer = history[s + 4]
    committer_dt = history[s + 5]
    commit_data << [commit_id, tree, author, author_dt, committer, committer_dt, log]

    # line 6 contains the parents, add those
    history[s + 6].split(/\s+/).each do |parent|
      dag_data << [commit_id, parent]
    end

    # move past the __END_GIT_COMMIT_LOG_MSG__
    i += 1

    # lines after this are the files that were changed and counts of lines changed
    while i < history.length && history[i] !~ /__START_GIT_COMMIT_LOG_MSG__/
      if history[i] =~ /^(\d+)\s+(\d+)\s+(.*)/
        # (lines added)\s+(lines removed)\s+(path)
        revision_data << [commit_id, $1.to_i, $2.to_i, $3]
      elsif history[i] =~ /^\-\s+\-\s+(.*)/
        # "-  -  path": no line counts available for this path
        revision_data << [commit_id, nil, nil, $1]
      end
      i += 1
    end
  end

  # the raw lines are no longer needed once everything has been collected
  history.clear
  $timer.log

  puts "inserting #{revision_data.length} revision records"
  db[:git_revision].sliced_multi_insert([:commit, :add, :remove, :path], revision_data)
  $timer.log

  puts "inserting #{dag_data.length} dag records"
  db[:git_dag].sliced_multi_insert([:child, :parent], dag_data)
  $timer.log

  puts "inserting #{commit_data.length} commit records"
  db[:git_commit].sliced_multi_insert([:commit, :tree, :author, :author_dt,
                                       :committer, :committer_dt, :log], commit_data)
  $timer.log

  puts "inserting #{refs_tags_data.length} ref/tag records"
  db[:git_refs_tags].sliced_multi_insert([:commit, :path], refs_tags_data)
  $timer.log

  puts "inserting #{repo_data.length} repo records"
  db[:git_repo].sliced_multi_insert([:repo, :commit], repo_data)
  $timer.log
end
# Post-processing: fills git_commit.num_parents and git_commit.num_children
# from the edges stored in git_dag.
#
# db - database handle responding to uri and << (run raw SQL)
#
# Only MySQL and PostgreSQL connection URIs are handled; for any other
# database nothing is executed.
def update_relations(db)
  if db.uri =~ /^mysql/
    puts "gathering number of parents"
    db << "update git_commit set num_parents = (select count(*) as parents from " \
          "git_dag where git_dag.child = git_commit.commit)"
    $timer.log

    puts "gathering number of children"
    # The subquery must be correlated on parent; without the where clause
    # every commit would receive the total number of rows in git_dag.
    db << "update git_commit set num_children = (select count(*) as children from " \
          "git_dag where git_dag.parent = git_commit.commit)"
    $timer.log
  elsif db.uri =~ /^postgres/
    puts "gathering number of parents"
    db << "update git_commit set num_parents = r.parents from " \
          "(select child, count(*) as parents from git_dag group by child) as r " \
          "where r.child = git_commit.commit"
    $timer.log

    puts "gathering number of children"
    db << "update git_commit set num_children = r.children from " \
          "(select parent, count(*) as children from git_dag group by parent) as r " \
          "where r.parent = git_commit.commit"
    $timer.log
  end
end
# Script entry point: parses the command line, connects to the database,
# makes sure the tables exist, loads the git log (from a saved log file or
# by running git in a repository), stores it and then fills in the derived
# parent/child counts.
#
# Exits with status 1 when neither a log file nor a repository is given.
def main
  opts = Trollop::options do
    version "git_extract.rb version 0.1"
    banner <<-EOS
git_extract.rb will mine information from git and put it into
a database. You must specify ONE of: repo location with -r or
log file with -l. The log file must be the result of running
(all on one line, no line breaks):
#{$git_log_cmd}
    EOS
    opt :repo, "path to git repository", :short => "r", :type => String
    opt :log, "log file for git repository", :short => "l", :type => String
    conflicts :log, :repo
    opt :dburl, "url for database example: postgres://cabird:passwd@localhost/git_db",
        :short => "d", :required => true, :type => String
    opt :name, "repo name to be stored in database", :short => "n", :required => true,
        :type => String
    opt :reset, "clear contents of tables before inserting"
  end

  db = Sequel.connect(opts[:dburl])
  setup_tables(db, opts[:reset])

  puts "getting log lines"
  # A log file takes precedence; otherwise run git in the given repository.
  history =
    if !opts[:log].nil?
      get_file_log_lines(opts[:log])
    elsif !opts[:repo].nil?
      get_git_log_lines(opts[:repo])
    else
      puts "error! no way to get log lines, must specify either location of repo or a log file"
      exit(1)
    end
  $timer.log

  parse_log(opts[:name], history, db)
  update_relations(db)
end
# Run main only when this file is executed directly, not when it is loaded
# from another script.
main() if __FILE__ == $0