Skip to content

Commit

Permalink
implemented #67 - improved corrupt data and error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
mdorf committed Apr 11, 2023
1 parent 3b9fdb0 commit 030930c
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 15 deletions.
8 changes: 3 additions & 5 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
GIT
remote: https://github.com/ncbo/goo.git
revision: 15023141f6051d4fa6cba6081d082c720327b0c9
revision: b8c7867450ec6ea2d3167eb9d9b1aed5614a1ce3
branch: develop
specs:
goo (0.0.2)
Expand All @@ -26,7 +26,7 @@ GIT

GIT
remote: https://github.com/ncbo/ontologies_linked_data.git
revision: 297f630ee5a35a78b015adf32fdb1e3af59ca652
revision: 79527b94fbb59081ba58281a5cd51ec3448fadb0
branch: develop
specs:
ontologies_linked_data (0.0.1)
Expand All @@ -46,7 +46,7 @@ GIT

GIT
remote: https://github.com/ncbo/sparql-client.git
revision: fb4a89b420f8eb6dda5190a126b6c62e32c4c0c9
revision: 55e7dbf858eb571c767bc67868f9af61663859cb
branch: develop
specs:
sparql-client (1.0.1)
Expand Down Expand Up @@ -235,9 +235,7 @@ GEM
webrick (1.8.1)

PLATFORMS
ruby
x86_64-darwin-18
x86_64-darwin-21

DEPENDENCIES
cube-ruby
Expand Down
108 changes: 99 additions & 9 deletions bin/ncbo_ontology_archive_old_submissions
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,121 @@ require_relative '../lib/ncbo_cron'
config_exists = File.exist?(File.expand_path('../../config/config.rb', __FILE__))
abort("Please create a config/config.rb file using the config/config.rb.sample as a template") unless config_exists
require_relative '../config/config'
require 'optparse'

logfile = 'archive_old_submissions.log'
# Deletes a stale log file left over from a previous run so the new run
# starts with an empty log.
#
# @param target [String, IO] the log destination: a file path, or an IO such as STDOUT
# @return [Boolean] true when a file was removed, false otherwise
def remove_stale_logfile(target)
  # File.file? accepts an IO, but File.delete only accepts paths. An IO target
  # (e.g. STDOUT redirected to a regular file) must therefore never reach it.
  return false unless target.is_a?(String) && File.file?(target)

  File.delete(target)
  true
end

# Command-line defaults: log to STDOUT and keep submissions that contain bad data.
options = { delete: false, logfile: STDOUT }
opt_parser = OptionParser.new do |opts|
  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: #{File.basename(__FILE__)} [options]"

  opts.on('-l', '--logfile FILE', "Write log to FILE (default is STDOUT)") do |filename|
    options[:logfile] = filename
  end

  # Delete submission if it contains bad data
  opts.on('-d', '--delete', "Delete submissions that contain bad data") do
    options[:delete] = true
  end

  # Display the help screen, all programs are assumed to have this option.
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end

opt_parser.parse!
logfile = options[:logfile]
remove_stale_logfile(logfile)
logger = Logger.new(logfile)
options = { process_rdf: false, index_search: false, index_commit: false,
run_metrics: false, reasoning: false, archive: true }
process_actions = { process_rdf: false, generate_labels: false, index_search: false, index_commit: false,
process_annotator: false, diff: false, run_metrics: false, archive: true }
onts = LinkedData::Models::Ontology.all
onts.each { |ont| ont.bring(:acronym, :submissions) }
onts.sort! { |a,b| a.acronym <=> b.acronym }
onts.sort! { |a, b| a.acronym <=> b.acronym }
bad_submissions = {}

onts.each do |ont|
latest_sub = ont.latest_submission
if not latest_sub.nil?

unless latest_sub.nil?
id = latest_sub.submissionId
subs = ont.submissions
old_subs = subs.reject { |sub| sub.submissionId >= id }
old_subs.sort! { |a,b| a.submissionId <=> b.submissionId }

# Keep only the submissions that are older than the latest one (ID below `id`).
# A submission whose ID cannot be compared with `id` is reported, optionally
# deleted (-d), recorded in bad_submissions and dropped from the result.
old_subs = subs.reject do |sub|
  sub.submissionId >= id
rescue => e
  # Print to the console and write to the log in one step.
  announce = lambda do |text|
    puts text
    logger.error(text)
  end
  announce.call("Invalid submission ID detected (String instead of Integer): #{ont.acronym}/#{sub.submissionId} - #{e.class}:\n#{e.backtrace.join("\n")}")

  if options[:delete]
    sub.delete
    announce.call("Deleted submission #{ont.acronym}/#{sub.submissionId} due to invalid Submission ID")
  end
  bad_submissions["#{ont.acronym}/#{sub.submissionId}"] = "Invalid Submission ID"
  # Returning true makes reject discard the bad submission.
  true
end
old_subs.sort! { |older, newer| older.submissionId <=> newer.submissionId }
old_subs.each do |sub|
if not sub.archived?
unless sub.archived?
msg = "#{ont.acronym}: found un-archived old submission with ID #{sub.submissionId}."
puts msg
logger.info msg
NcboCron::Models::OntologySubmissionParser.new.process_submission(logger, sub.id.to_s, options)

# Archive the old submission. A failure is reported and recorded in
# bad_submissions so that the remaining submissions are still processed.
begin
  NcboCron::Models::OntologySubmissionParser.new.process_submission(logger, sub.id.to_s, process_actions)
rescue => e
  # Captured once, before any deletion, so every message names the same submission.
  sub_label = "#{ont.acronym}/#{sub.submissionId}"
  # Include the exception message; the class and backtrace alone rarely explain the failure.
  failure = "#{e.class}: #{e.message}\n#{Array(e.backtrace).join("\n")}"
  # Print to the console and write to the log in one step.
  report = lambda do |text|
    puts text
    logger.error(text)
  end

  # `case` matches with ===, so subclasses of the listed exceptions are handled too.
  case e
  when Goo::Base::NotValidException
    if sub.valid?
      report.call("Error archiving submission #{sub_label} - #{failure}")
      bad_submissions[sub_label] = "Submission passes valid check but cannot be saved"
    else
      report.call("Error archiving submission #{sub_label}:\n#{JSON.pretty_generate(sub.errors)}")

      if options[:delete]
        sub.delete
        report.call("Deleted submission #{sub_label} due to invalid data")
      end
      bad_submissions[sub_label] = "Submission is not valid to be saved"
    end
  else
    report.call("Error archiving submission #{sub_label} - #{failure}")

    # With -d, these two connection errors lead to deletion (reported as a non-working pull URL).
    unreachable = [Net::HTTPBadResponse, Errno::ECONNREFUSED].any? { |klass| e.is_a?(klass) }
    if options[:delete] && unreachable
      sub.delete
      report.call("Deleted submission #{sub_label} due to a non-working pull URL")
    end
    bad_submissions[sub_label] = "#{e.class} - Runtime error"
  end
end
end
end
end
end

# Final report: the failed submissions as pretty-printed JSON, then their count.
# Both lines go to the console and to the log.
report_lines = [
  JSON.pretty_generate(bad_submissions),
  "Number of errored submissions: #{bad_submissions.length}"
]
puts
report_lines.each do |line|
  puts line
  logger.error(line)
end


2 changes: 1 addition & 1 deletion lib/ncbo_cron/ontology_submission_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def process_submission(logger, submission_id, actions=ACTIONS)

# Check to make sure the file has been downloaded
if sub.pullLocation && (!sub.uploadFilePath || !File.exist?(sub.uploadFilePath))
multi_logger.debug "Pull location found, but no file in the upload file path. Retrying download."
multi_logger.debug "Pull location found (#{sub.pullLocation}, but no file in the upload file path (#{sub.uploadFilePath}. Retrying download."
file, filename = sub.download_ontology_file
file_location = sub.class.copy_file_repository(sub.ontology.acronym, sub.submissionId, file, filename)
file_location = "../" + file_location if file_location.start_with?(".") # relative path fix
Expand Down

0 comments on commit 030930c

Please sign in to comment.