-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjss_audit.rake
534 lines (457 loc) · 18.5 KB
/
jss_audit.rake
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
# Commands for running a rake task from this file in each environment (audit task shown):
# Development: rake jss_solr:audit
# Test: RAILS_ENV=test bundle exec rake jss_solr:audit
# Production: RAILS_ENV=production rake jss_solr:audit
require 'net/http'
#require 'rsolr'
require 'json'
require 'open3'
require 'fileutils'
require 'open-uri'
require 'csv'
require 'pathname'
require 'net/ssh'
#require 'rake/remote_task'
# Base URL of the Solr instance queried by this file. It ends with a trailing
# slash, so handler names such as "select" are appended directly.
BASE_SOLR_URL = "http://127.0.0.1:8035/apache-solr-3.3.0/".freeze
# Prefix and suffix placed around a PID to build the URL of its JPG datastream content.
BASE_IMAGE_URL = "http://dcoll.library.yale.edu/fedora/objects/".freeze
IMAGE_POST_URL = "/datastreams/JPG/content".freeze
# SSH connection settings used by ssh_digcoll. Each value can be overridden
# through the environment so real credentials need not live in source control;
# the defaults are the values this file has always shipped with.
HOST = ENV.fetch('JSS_SSH_HOST', "172.18.56.56").freeze
USER = ENV.fetch('JSS_SSH_USER', "llz6").freeze
PASS = ENV.fetch('JSS_SSH_PASS', "xxxxx").freeze
# Email to send report to
EMAIL_RECIPIENT = 'maggie.zhao@yale.edu'.freeze
#LOG_FILE = "#{log_path}/purge_fedora_solr.log"
# Fetches the BIBID of every title-level record from Solr.
#
# Sends one request to the "select" handler under BASE_SOLR_URL, filtered on
# LEVEL:title and limited to 600 rows, and parses the JSON response.
#
# @return [Array] the 'BIBID' field of each returned document, in response order
def get_title_level_bibs
  # BASE_SOLR_URL already ends with "/"; trim it so the path never becomes "//select".
  solr_uri = URI("#{BASE_SOLR_URL.chomp('/')}/select" \
                 '?indent=on&fq=LEVEL:title&fl=BIBID,PID,TITLE,numFound&wt=json&start=0&rows=600')
  solr_json = JSON.parse(Net::HTTP.get(solr_uri))
  solr_json['response']['docs'].map { |doc| doc['BIBID'] }
end
#puts "BIBID Title_PIDS Volumes_PIDS Barcode: "
# Prints the volume-level Solr records belonging to each given BIBID.
#
# For every entry in +bibs+ one request is sent to the "select" handler with
# the filter "LEVEL:volume AND BIBID:<bib>". Each returned document is printed
# as "bib, PID, VOLUME, SIP" followed by a line holding the SIP alone, and the
# summed numFound values are printed at the end.
#
# @param bibs [Enumerable] BIBIDs to look up
# @return [Array] always an empty array; nothing is collected (kept as-is for callers)
def get_volume_level(bibs)
  results = []
  count = 0
  bibs.each do |b|
    solr_uri = URI("#{BASE_SOLR_URL}select")
    # The filter used to be commented out, so every request hit a bare
    # "select". It is form-encoded here because the raw spaces in it are not
    # legal in a URI and make URI() raise.
    solr_uri.query = URI.encode_www_form(
      'indent' => 'on',
      'fq' => "LEVEL:volume AND BIBID:#{b}",
      'fl' => 'BIBID,PARENTPID,PID, VOLUME, SIP,numFound',
      'wt' => 'json',
      'start' => '0',
      'rows' => '600'
    )
    solr_json = JSON.parse(Net::HTTP.get(solr_uri))
    count += solr_json['response']['numFound'].to_i
    solr_json['response']['docs'].each do |doc|
      puts "#{b}, #{doc['PID']}, #{doc['VOLUME']}, #{doc['SIP']}"
      puts "#{doc['SIP']}"
    end
  end
  puts "total volume level: #{count}"
  results
end
# Prints a listing of all volume-level Solr records sorted by BIBID.
#
# Output is a header line, then one "BIBID, PARENTPID, PID, SIP" line per
# document (up to 4000 rows), then the numFound total reported by Solr.
#
# @return [Array] always an empty array; nothing is collected (kept as-is for callers)
def get_volume_level_maura
  results = []
  solr_uri = URI("#{BASE_SOLR_URL}select")
  # Form-encode the parameters: the filter and sort values contain spaces,
  # which are not legal in a URI and made URI() raise.
  solr_uri.query = URI.encode_www_form(
    'indent' => 'on',
    'fq' => 'LEVEL:volume',
    'fl' => 'BIBID,PARENTPID,PID, VOLUME, SIP,numFound',
    'sort' => 'BIBID asc',
    'wt' => 'json',
    'start' => '0',
    'rows' => '4000'
  )
  solr_json = JSON.parse(Net::HTTP.get(solr_uri))
  result_num = solr_json['response']['numFound'].to_i
  puts "BIBID, Title_PIDS, Volumes_PIDS, Barcode "
  solr_json['response']['docs'].each do |doc|
    puts "#{doc['BIBID']}, #{doc['PARENTPID']}, #{doc['PID']}, #{doc['SIP']}"
  end
  puts "total volume level: #{result_num}"
  results
end
# Queries Solr for volume-level records and prints how many exist.
#
# Requests up to 600 volume-level documents, prints the numFound value that
# Solr reports, and hands the PIDs of the returned documents back to the caller.
#
# @return [Array] the 'PID' field of each returned document
def get_volume_level_pids
  solr_uri = URI("#{BASE_SOLR_URL}select")
  # Form-encode the parameters: the original field list contained spaces,
  # which are not legal in a URI and made URI() raise.
  solr_uri.query = URI.encode_www_form(
    'indent' => 'on',
    'fq' => 'LEVEL:volume',
    'fl' => 'BIBID,PID, VOLUME, SIP,numFound',
    'wt' => 'json',
    'start' => '0',
    'rows' => '600'
  )
  solr_json = JSON.parse(Net::HTTP.get(solr_uri))
  result_num = solr_json['response']['numFound'].to_i
  pids = solr_json['response']['docs'].map { |doc| doc['PID'] }
  puts "#{result_num}"
  pids
end
# Prints the page-level Solr records of each BIBID together with image URLs.
#
# For every BIBID one request is sent with the filter
# "LEVEL:page AND BIBID:<bib>" (up to 5000 rows). Per BIBID a count line and a
# header line are printed, then one line per page document ending in the URL
# built from BASE_IMAGE_URL, the PID and IMAGE_POST_URL. Totals are printed
# once all BIBIDs are processed.
#
# @param bibs [Enumerable] BIBIDs to look up; when omitted the list comes from
#   get_title_level_bibs (the method previously referenced an undefined `bibs`)
# @return [Array] the 'BIBID' field of every page document seen
def get_page_level(bibs = get_title_level_bibs)
  results = []
  count = 0
  bib_count = 0
  bibs.each do |b|
    solr_uri = URI("#{BASE_SOLR_URL}select")
    # Form-encode the parameters: the filter contains spaces, which are not
    # legal in a URI and made URI() raise.
    solr_uri.query = URI.encode_www_form(
      'indent' => 'on',
      'fq' => "LEVEL:page AND BIBID:#{b}",
      'fl' => 'BIBID,PID, PARENTPID, VOLUME, PAGE,SIP,numFound',
      'wt' => 'json',
      'start' => '0',
      'rows' => '5000'
    )
    bib_count += 1
    solr_json = JSON.parse(Net::HTTP.get(solr_uri))
    result_num = solr_json['response']['numFound'].to_i
    count += result_num
    puts "BIBID: #{b}, pages: #{result_num} "
    puts "BIBID, PID, PARENTPID, PAGE, VOLUME, IMAGE_URL"
    solr_json['response']['docs'].each do |doc|
      results.push doc['BIBID']
      puts "#{b}, #{doc['PID']}, #{doc['PARENTPID']},#{doc['PAGE']},#{doc['VOLUME']},#{BASE_IMAGE_URL}#{doc['PID']}#{IMAGE_POST_URL}"
    end
  end
  puts "total page level number #{count} total bib number #{bib_count}"
  results
end
# Prints the PID of every page-level Solr record of the given BIBIDs.
#
# For every entry in +bibs+ one request is sent with the filter
# "LEVEL:page AND BIBID:<bib>"; each returned document's PID is printed on its
# own line.
#
# @param bibs [Enumerable] BIBIDs to look up
# @return [Array] every PID that was printed, in output order
def get_page_level_pid(bibs)
  pids = []
  bibs.each do |b|
    solr_uri = URI("#{BASE_SOLR_URL}select")
    # Form-encode the parameters: the filter contains spaces, which are not
    # legal in a URI and made URI() raise.
    solr_uri.query = URI.encode_www_form(
      'indent' => 'on',
      'fq' => "LEVEL:page AND BIBID:#{b}",
      'fl' => 'BIBID,PID, PARENTPID, VOLUME, PAGE,SIP,numFound',
      'wt' => 'json',
      'start' => '0',
      'rows' => '3000000'
    )
    solr_json = JSON.parse(Net::HTTP.get(solr_uri))
    solr_json['response']['docs'].each do |doc|
      puts "#{doc['PID']}"
      pids << doc['PID']
    end
  end
  pids
end
# Prints the PIDs of all page-level Solr records as one comma-separated line.
#
# Every PID is followed by a comma, including the last one, so the output has
# the shape "pid1,pid2,".
#
# @return [nil]
def get_page_level_pid_2
  solr_uri = URI("#{BASE_SOLR_URL}select")
  # Form-encode the parameters: the original query string contained a space,
  # which is not legal in a URI and made URI() raise.
  solr_uri.query = URI.encode_www_form(
    'indent' => 'on',
    'fq' => 'LEVEL:page',
    'fl' => 'BIBID,PID,numFound',
    'wt' => 'json',
    'start' => '0',
    'rows' => '3000000'
  )
  solr_json = JSON.parse(Net::HTTP.get(solr_uri))
  puts solr_json['response']['docs'].map { |doc| "#{doc['PID']}," }.join
end
# Prints the PID of every page-level Solr record, one "pid: <PID>" line each.
#
# NOTE(review): the field list asks for PARENT_PID while the other queries in
# this file use PARENTPID; the value is not read here, so it is left unchanged.
#
# @return [Array] the PIDs that were printed, in output order
def get_page_level_pid_3
  solr_uri = URI("#{BASE_SOLR_URL}select")
  # Form-encode the parameters: the original query string contained a space,
  # which is not legal in a URI and made URI() raise.
  solr_uri.query = URI.encode_www_form(
    'indent' => 'on',
    'fq' => 'LEVEL:page',
    'fl' => 'PARENT_PID,BIBID,PID,numFound',
    'wt' => 'json',
    'start' => '0',
    'rows' => '3000000'
  )
  solr_json = JSON.parse(Net::HTTP.get(solr_uri))
  solr_json['response']['docs'].map do |doc|
    puts "pid: #{doc['PID']}"
    doc['PID']
  end
end
# Audits the PIDs listed in jss_pids_list.csv against the Fedora JPG datastream.
#
# The file is read from the current directory as a comma-separated list. For
# each PID the JPG datastream profile is requested with checksum validation;
# a response containing "NO PATH IN DB REGISTRY FOR" (case-insensitive) marks
# the PID as not found and is reported. The number of such PIDs is printed last.
#
# NOTE(review): the Fedora credentials are hard-coded below; consider moving
# them out of source control.
#
# @return [nil]
def compare_csv_imagesAPI
  not_found_pids = []
  timed_out_pids = []
  puts "check images in fedora"
  # Entries can carry whitespace (e.g. the file's trailing newline), and a
  # trailing comma leaves an empty entry; neither is a usable PID.
  pids = File.read('jss_pids_list.csv').split(",").map { |entry| entry.strip }.reject { |entry| entry.empty? }
  pids.each do |pid|
    uri = URI.parse("http://dcoll.library.yale.edu/fedora/objects/#{pid}/datastreams/JPG?format=xml&validateChecksum=true")
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = uri.scheme == 'https'
    begin
      request = Net::HTTP::Get.new(uri.request_uri)
      request.basic_auth("fedoraAdmin","fed%thrty5")
      body = http.request(request).body.to_s.upcase
      if body.include?("NO PATH IN DB REGISTRY FOR")
        not_found_pids << pid
        puts "NO PATH IN DB REGISTRY FOR [#{pid}]"
      end
    rescue Timeout::Error
      # Keep auditing the remaining PIDs; a slow response is reported, not fatal.
      timed_out_pids << pid
      puts "Timeout::Error for #{pid}"
    end
  end
  puts "not found # #{not_found_pids.size}"
end
# Checks that every page under volume slavicbooks:116784 has JPG content in Fedora.
#
# The page-level children of the hard-coded parent PID are looked up in Solr;
# for each one the JPG datastream content is requested. A response containing
# "NO PATH IN DB REGISTRY FOR" (case-insensitive) counts as not found, any
# other response as found. Both counts are printed at the end.
#
# @return [nil]
def download_parent_childrenpids #example: pid=slavicbooks:116784
  solr_uri = URI("#{BASE_SOLR_URL}select")
  # Form-encode the parameters: the filter holds spaces and a backslash (the
  # Solr escape for ":"), neither of which is legal in a raw URI.
  solr_uri.query = URI.encode_www_form(
    'indent' => 'on',
    'fq' => 'PARENTPID:slavicbooks\\:116784 AND LEVEL:page',
    'fl' => 'BIBID,PID,numFound',
    'wt' => 'json',
    'start' => '0',
    'rows' => '3000000'
  )
  solr_docs = JSON.parse(Net::HTTP.get(solr_uri))['response']['docs']
  error_pids = []
  not_found_count = 0
  found_count = 0
  solr_docs.each do |doc|
    pid = doc['PID']
    uri = URI.parse("http://dcoll.library.yale.edu/fedora/objects/#{pid}/datastreams/JPG/content")
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = uri.scheme == 'https'
    begin
      body = http.get(uri.request_uri).body.to_s.upcase
      if body.include?("NO PATH IN DB REGISTRY FOR")
        not_found_count += 1
        error_pids << pid
        puts "NO PATH IN DB REGISTRY FOR [#{pid}]"
      else
        puts "find [ #{pid}]"
        found_count += 1
      end
    rescue Timeout::Error
      # A timed-out PID counts as neither found nor not found; it is only reported.
      error_pids << pid
      puts "Timeout::Error for #{pid}"
    end
  end
  puts "not found # #{not_found_count}, found # #{found_count}"
end
# Checks that every page-level record in Solr has JPG content in Fedora.
#
# All page-level PIDs are fetched from Solr; for each one the JPG datastream
# content is requested. A response containing "NO PATH IN DB REGISTRY FOR"
# (case-insensitive) marks the PID as not found. The count and the
# comma-separated list of not-found PIDs are printed at the end.
#
# @return [nil]
def check_images_datastream
  solr_uri = URI("#{BASE_SOLR_URL}select")
  # Form-encode the parameters: the original query string contained a space,
  # which is not legal in a URI and made URI() raise.
  solr_uri.query = URI.encode_www_form(
    'indent' => 'on',
    'fq' => 'LEVEL:page',
    'fl' => 'BIBID,PID,numFound',
    'wt' => 'json',
    'start' => '0',
    'rows' => '3000000'
  )
  solr_docs = JSON.parse(Net::HTTP.get(solr_uri))['response']['docs']
  not_found_pid = []
  error_pid = []
  puts "check images in fedora"
  #http://dcoll.library.yale.edu/fedora/objects/slavicbooks:141121/datastreams/JPG/content
  solr_docs.each do |doc|
    pid = doc['PID']
    uri = URI.parse("http://dcoll.library.yale.edu/fedora/objects/#{pid}/datastreams/JPG/content")
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = uri.scheme == 'https'
    begin
      body = http.get(uri.request_uri).body.to_s.upcase
      if body.include?("NO PATH IN DB REGISTRY FOR")
        not_found_pid << pid
        puts "NO PATH IN DB REGISTRY FOR [#{pid}]"
      else
        puts "find [ #{pid}]"
      end
    rescue Timeout::Error
      # Keep checking the remaining PIDs; a slow response is reported, not fatal.
      error_pid << pid
      puts "Timeout::Error for #{pid}"
    end
  end
  puts "not found # #{not_found_pid.size}"
  puts "This pids are not found #{not_found_pid.join(',')}"
end
# Spot-checks a fixed list of sample PIDs against the Fedora JPG datastream.
#
# For each PID the datastream profile URL is printed, requested with basic
# auth, and the upper-cased response body is printed. A body containing
# "NO PATH IN DB REGISTRY FOR" marks the PID as not found. The not-found count
# and the list of problem PIDs (not found or timed out) are printed at the end.
#
# NOTE(review): the Fedora credentials are hard-coded below; consider moving
# them out of source control.
#
# @return [nil]
def check_sample_images
  not_found_count = 0
  error_pid = []
  puts "check images in fedora"
  pids = ["slavicbooks:141110222","slavicbooks:141117","slavicbooks:141091","slavicbooks:140368","slavicbooks:140364"]
  pids.each do |pid|
    # The original literal was left unterminated after "JPG?" and swallowed the
    # following line; the query string used by compare_csv_imagesAPI is restored.
    datastream = "http://dcoll.library.yale.edu/fedora/objects/#{pid}/datastreams/JPG?format=xml&validateChecksum=true"
    puts "#{datastream}"
    uri = URI.parse(datastream)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = uri.scheme == 'https'
    begin
      request = Net::HTTP::Get.new(uri.request_uri)
      request.basic_auth("fedoraAdmin","fed%thrty5")
      body = http.request(request).body.to_s.upcase
      puts "#{body}"
      if body.include?("NO PATH IN DB REGISTRY FOR")
        not_found_count += 1
        puts "NO PATH IN DB REGISTRY FOR [#{pid}]"
        error_pid.push(pid)
      else
        puts "find [ #{pid}]"
      end
    rescue Timeout::Error
      # Keep checking the remaining PIDs; a slow response is reported, not fatal.
      error_pid.push(pid)
      puts "Timeout::Error for #{pid}"
    end
  end
  puts "#{not_found_count}"
  puts error_pid.inspect
end
# Downloads the JPG of every page under volume slavicbooks:116784.
#
# The page-level children of the hard-coded parent PID are looked up in Solr
# (up to 30000 rows); each image is fetched from the local Fedora instance via
# download_image and stored under a hard-coded directory as "<PID>.jpg", with
# the first ":" of the PID replaced by "-".
#
# @return [Array] the Solr documents that were processed
def download_116784_image
  #slavicbooks:4644
  solr_uri = URI("#{BASE_SOLR_URL}select")
  # Form-encode the parameters: the filter holds spaces and a backslash (the
  # Solr escape for ":"), neither of which is legal in a raw URI.
  solr_uri.query = URI.encode_www_form(
    'indent' => 'on',
    'fq' => 'PARENTPID:slavicbooks\\:116784 AND LEVEL:page',
    'fl' => 'BIBID,PID,numFound',
    'wt' => 'json',
    'start' => '0',
    'rows' => '30000'
  )
  solr_docs = JSON.parse(Net::HTTP.get(solr_uri))['response']['docs']
  solr_docs.each do |doc|
    pid = doc['PID']
    fedora_query = "http://127.0.0.1:8035/fedora/objects/#{pid}/datastreams/JPG/content"
    # download_image performs the only fetch; the image used to be requested
    # a second time here and the response thrown away.
    download_image(fedora_query,"/Users/LixiaZhao/good/valkyrie-blacklight-poc/jss_images_116784/#{pid.sub(':','-')}.jpg")
  end
end
# Downloads the resource at +url+ and writes it to +dest+ in binary mode.
#
# The URL is opened through open-uri's URI#open rather than Kernel#open:
# Kernel#open stopped accepting URLs in Ruby 3.0, and it would also run a
# command for input starting with "|". Only URLs are accepted as a result.
#
# @param url [String] HTTP(S) URL to fetch
# @param dest [String] path of the file to create or overwrite
# @return [Integer] number of bytes written
def download_image(url, dest)
  URI.parse(url).open do |remote|
    # Stream to disk instead of holding the whole image in memory.
    File.open(dest, 'wb') { |local| IO.copy_stream(remote, local) }
  end
end
# Opens an SSH session to HOST as USER (password auth with PASS), runs a
# `find` for paths containing "PDF" (case-insensitive) under
# /fedora_store_rss/data/datastreams, and prints whatever the command returned.
#
# Earlier experiments used the same pattern with other commands, e.g.
# `ls`, `grep -r "JPG"`, `find . -ipath "*slavicbooks*" -ipath "*JPG*"`
# or `find . -ipath "*153457+JPG*"`.
def ssh_digcoll
  remote_command = 'cd /fedora_store_rss/data/datastreams; find . -ipath "*PDF*"'
  Net::SSH.start(HOST, USER, password: PASS) do |session|
    listing = session.exec!(remote_command)
    puts listing.to_s
  end
end
# Downloads the page images of every volume PID listed in a CSV file.
#
# The file is a comma-separated list of volume PIDs. For each one a directory
# named after the PID (first ":" replaced by "-") is created under +dest_root+,
# the volume's page-level children are looked up in Solr (up to 10000 rows),
# and each child's JPG is fetched from the local Fedora instance via
# download_image into that directory as "<child PID>.jpg".
#
# @param pid_list_path [String] path of the CSV file holding the volume PIDs
# @param dest_root [String] directory under which per-volume folders are created
# @return [Array<String>] the volume PIDs that were processed
def download_all_images(pid_list_path = 'volume_pids.csv', dest_root = '/Users/LixiaZhao/good/jss_task')
  puts "download images from fedora systems"
  # Entries can carry whitespace (e.g. the file's trailing newline), and a
  # trailing comma leaves an empty entry; neither is a usable PID.
  volume_pids = File.read(pid_list_path).split(",").map { |entry| entry.strip }.reject { |entry| entry.empty? }
  volume_pids.each do |volume_pid|
    volume_dir = File.join(dest_root, volume_pid.sub(':','-'))
    FileUtils.mkdir_p volume_dir
    # Solr needs the ":" inside the PID escaped; the block form of sub keeps
    # the backslash literal.
    escaped_pid = volume_pid.sub(':') { '\\:' }
    solr_uri = URI("#{BASE_SOLR_URL}select")
    # Form-encode the parameters: spaces and backslashes are not legal in a raw URI.
    solr_uri.query = URI.encode_www_form(
      'indent' => 'on',
      'fq' => "PARENTPID:#{escaped_pid} AND LEVEL:page",
      'fl' => 'BIBID,PID,numFound',
      'wt' => 'json',
      'start' => '0',
      'rows' => '10000'
    )
    puts "#{solr_uri}"
    solr_docs = JSON.parse(Net::HTTP.get(solr_uri))['response']['docs']
    solr_docs.each do |child|
      fedora_query = "http://127.0.0.1:8035/fedora/objects/#{child['PID']}/datastreams/JPG/content"
      download_image(fedora_query, File.join(volume_dir, "#{child['PID'].sub(':','-')}.jpg"))
    end
  end
end
# Rake entry points for the JSS collection helpers defined in this file.
# Each task is a thin wrapper that calls exactly one helper method; invoke
# them as `rake jss_solr:<task name>`.
namespace :jss_solr do
  desc "Get the list of title level bib ids of JSS collection"
  task(:titleBIDs) { get_title_level_bibs }

  desc "Get volume level pids and save in a file"
  task(:volume) { get_volume_level_pids }

  desc "Get the list of page level pids of jss collection comma separated string"
  task(:page_children_comma) { get_page_level_pid_2 }

  desc "Get the list of page level pids of jss collection csv array"
  task(:page_children_csv) { get_page_level_pid_3 }

  desc "download 332 children images of a sample volume level pids"
  task(:download_Sample_Images) { download_116784_image }

  desc "scan fedora image file systems"
  task(:scan_file_system) { ssh_digcoll }

  # Alternatively, compare the two CSV outputs with the python script.
  desc "compare page_children_comma output pids with fedeora api to audit if all images are in the fedeora"
  task(:audit) { compare_csv_imagesAPI }

  desc "get bibid, titlepid, volumepid, barcode"
  task(:list_of_ids) { get_volume_level_maura }

  desc "Download all images and put in each volume pids folder"
  task(:download_all_images) { download_all_images }
end