Batch ingest task #3143

Open

This pull request wants to merge 40 commits into base: main

Commits (40)
10edc43
Fix controlled vocabularies and filename
piyapongch Jul 28, 2021
b1dd2ab
Add lowercase languages field value
piyapongch Jul 28, 2021
0aef3f9
Convert visibility to lowercase
piyapongch Jul 29, 2021
5e7fcf1
Convert license to lowercase and replace - with _
piyapongch Jul 29, 2021
8aa9de3
Replace . - and space with _
piyapongch Jul 29, 2021
5dd9dc8
Fix character substitutions
piyapongch Jul 29, 2021
00e6144
Remove lowercase conversion on license
piyapongch Aug 3, 2021
0488aa1
Change type field to item_type
piyapongch Aug 5, 2021
db2f2c1
Remove comments
piyapongch Aug 11, 2021
ca892c0
Add legacy_thesis_ingest task
piyapongch Aug 13, 2021
5628cd8
Fix graduation_date field
piyapongch Aug 13, 2021
f5342e1
Modify URL in the report to use HOSTNAME env variable
piyapongch Aug 19, 2021
a4cccc1
Reverse change
piyapongch Aug 19, 2021
4d9cd4f
Revert "Reverse change"
piyapongch Aug 19, 2021
67ae640
- Modify to support legacy thesis template
piyapongch Sep 1, 2021
3fcc222
Remove log message and add ingest_errors data
piyapongch Sep 1, 2021
919a589
Fix data_submitted and nil graduation_term
piyapongch Sep 1, 2021
a910fac
Modify ingest error report generator
piyapongch Sep 1, 2021
b73fb90
Remove required on proquest field
piyapongch Sep 2, 2021
6328ecc
Add error_message and backtrace on ingest_error.csv report
piyapongch Sep 2, 2021
5449679
Reduce backtrace message to 10 lines
piyapongch Sep 2, 2021
5a0a052
Reduce backtrace message console display to only one line
piyapongch Sep 3, 2021
adf9df1
Add begin at the start of method
piyapongch Sep 8, 2021
0771408
Modify to multiple department separated by |
piyapongch Sep 13, 2021
9405d3d
Add ingest_error_report to thesis and item batch ingest and item titl…
piyapongch Sep 15, 2021
8b03852
Fix item_ingest, owner_id, subjects and date_created fields
piyapongch Oct 19, 2021
a1373a0
Add fedora3_uuid field for fedora3 thesis redirecting
piyapongch Nov 22, 2021
d42338b
Update controlled vocabulary according to v2.3.1
piyapongch Dec 10, 2021
d0ca8ec
Add generate_ingest_production report
piyapongch Jan 7, 2022
e9d14a2
Add fedora3_uuid field to legacy_thesis_ingest from thesis_ingest
piyapongch Jan 10, 2022
edc4d8b
Add multiple committee_members field support
piyapongch Jan 10, 2022
e04af2f
Add mode
piyapongch Jan 14, 2022
681025a
Remove mode
piyapongch Jan 17, 2022
dd6b15a
Add upload args to batch_ingest_items task
piyapongch Jan 20, 2022
38421f7
Remove upload args parameter
piyapongch Jan 21, 2022
4bea843
Change license_text to rights
piyapongch Feb 3, 2022
60cb82b
Merge branch 'master' into batch_ingest_task
piyapongch Jun 12, 2023
f7f50ff
Fix line too long
piyapongch Jun 12, 2023
bb03476
Fix line too long
piyapongch Jun 12, 2023
7fc7f8d
Fix white space
piyapongch Jun 12, 2023
237 changes: 177 additions & 60 deletions lib/tasks/batch_ingest.rake
@@ -4,7 +4,7 @@ INDEX_OFFSET = 1
namespace :jupiter do
desc 'batch ingest for multiple items from a csv file - used by ERA Admin and ERA Assistants'
task :batch_ingest_items, [:csv_path] => :environment do |_t, args|
csv_path = args.csv_path
csv_path = args[:csv_path]
batch_ingest_csv(csv_path)
end

@@ -13,10 +13,20 @@ namespace :jupiter do
csv_path = args.csv_path
full_csv_path = File.expand_path(csv_path)
csv_directory = File.dirname(full_csv_path)
checksums = generate_checksums(csv_directory)

batch_ingest_csv(csv_path) do |object_data, index|
thesis_ingest(object_data, index, csv_directory, checksums)
thesis_ingest(object_data, index, csv_directory)
end
end

desc 'batch ingest for legacy theses from a csv file'
task :batch_ingest_legacy_theses, [:csv_path] => :environment do |_t, args|
csv_path = args.csv_path
full_csv_path = File.expand_path(csv_path)
csv_directory = File.dirname(full_csv_path)

batch_ingest_csv(csv_path) do |object_data, index|
legacy_thesis_ingest(object_data, index, csv_directory)
end
end
end
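# Example invocations (a sketch; the CSV paths are placeholders, and the brackets
# may need quoting depending on the shell):
#   bundle exec rake jupiter:batch_ingest_items[/path/to/items.csv]
#   bundle exec rake jupiter:batch_ingest_theses[/path/to/theses.csv]
#   bundle exec rake jupiter:batch_ingest_legacy_theses[/path/to/legacy_theses.csv]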
@@ -25,7 +35,6 @@ def batch_ingest_csv(csv_path)
require 'csv'
require 'fileutils'
log 'START: Batch ingest started...'

if csv_path.blank?
log 'ERROR: CSV path must be present. Please specify a valid csv_path as an argument'
exit 1
@@ -36,23 +45,35 @@ def batch_ingest_csv(csv_path)

if File.exist?(full_csv_path)
successful_ingested = []
ingest_errors = []
ingested_data = []
CSV.foreach(full_csv_path,
headers: true,
header_converters: :symbol,
converters: :all).with_index(INDEX_OFFSET) do |object_data, index|
object = if block_given?
yield(object_data, index)
else
item_ingest(object_data, index, csv_directory)
end
successful_ingested << object
object, e = if block_given?
yield(object_data, index)
else
item_ingest(object_data, index, csv_directory)
end
if object.is_a?(Item) || object.is_a?(Thesis)
successful_ingested << object
ingested_data << object_data
else
object[:error_message] = e.message
object[:backtrace] = e.backtrace.take(10).join("\n")
ingest_errors << object
end
end

generate_ingest_report(successful_ingested)

headers = CSV.read(full_csv_path, headers: true).headers
generate_ingest_production(ingested_data, headers)
headers << 'error_message'
headers << 'backtrace'
generate_ingest_errors(ingest_errors, headers)
log 'FINISH: Batch ingest completed!'
else
log "ERROR: Could not open file at `#{full_csv_path}`. Does the csv file exist at this location?"
log "ERROR: Could not open file at '#{full_csv_path}'. Does the csv file exist at this location?"
exit 1
end
end
@@ -62,40 +83,68 @@ def log(message)
end

def generate_ingest_report(successful_ingested_items)
log 'REPORT: Generating ingest report...'
log 'REPORT: Generating ingest success report...'

FileUtils.mkdir_p INGEST_REPORTS_LOCATION

file_name = Time.current.strftime('%Y_%m_%d_%H_%M_%S')
full_file_name = "#{INGEST_REPORTS_LOCATION}/#{file_name}.csv"
full_file_name = "#{INGEST_REPORTS_LOCATION}/#{file_name}_ingest_successes.csv"

CSV.open(full_file_name, 'wb', headers: true) do |csv|
csv << ['id', 'url', 'title'] # Add headers

successful_ingested_items.each do |item|
csv << [item.id, Rails.application.routes.url_helpers.item_url(item), item.title]
csv << [item.id,
Rails.application.routes.url_helpers.item_url(item).gsub('era-test',
ENV['HOSTNAME'].split('.')[0]), item.title]
end
end
log 'REPORT: Ingest success report generated!'
log "REPORT: You can view report here: #{full_file_name}"
end

def generate_ingest_errors(ingest_errors, headers)
log 'REPORT: Generating ingest error report...'
file_name = Time.current.strftime('%Y_%m_%d_%H_%M_%S')
full_file_name = "#{INGEST_REPORTS_LOCATION}/#{file_name}_ingest_errors.csv"
CSV.open(full_file_name, 'wb', headers: true) do |csv|
csv << headers
ingest_errors.each do |item|
csv << item
end
end
log 'REPORT: Ingest error report generated!'
log "REPORT: You can view report here: #{full_file_name}"
end

log 'REPORT: Ingest report successfully generated!'
def generate_ingest_production(ingested_data, headers)
log 'REPORT: Generating ingest production report...'
file_name = Time.current.strftime('%Y_%m_%d_%H_%M_%S')
full_file_name = "#{INGEST_REPORTS_LOCATION}/#{file_name}_ingest_production.csv"
CSV.open(full_file_name, 'wb', headers: true) do |csv|
csv << headers
ingested_data.each do |data|
csv << data
end
end
log 'REPORT: Ingest production report generated!'
log "REPORT: You can view report here: #{full_file_name}"
end

def item_ingest(item_data, index, csv_directory)
log "ITEM #{index}: Starting ingest of an item..."

item = Item.new
item.tap do |unlocked_obj|
unlocked_obj.owner_id = item_data[:owner_id]
unlocked_obj.owner_id = 1
unlocked_obj.title = item_data[:title]
unlocked_obj.alternative_title = item_data[:alternate_title]

if item_data[:type].present?
unlocked_obj.item_type = ControlledVocabulary.era.item_type.from_value(item_data[:type])
if item_data[:item_type].present?
unlocked_obj.item_type = ControlledVocabulary.era.item_type.from_value(item_data[:item_type])
end

# If item type is an article, we need to add an array of statuses to the publication status field...
if item_data[:type] == 'article' && ['draft', 'published'].include?(item_data[:publication_status])
if item_data[:item_type] == 'article' && ['draft', 'published'].include?(item_data[:publication_status])
unlocked_obj.publication_status = if item_data[:publication_status] == 'draft'
[
ControlledVocabulary.era.publication_status.draft,
@@ -109,24 +158,24 @@ end
end

if item_data[:languages].present?
unlocked_obj.languages = item_data[:languages].split('|').map do |language|
unlocked_obj.languages = item_data[:languages].downcase.split('|').map do |language|
ControlledVocabulary.era.language.from_value(language) if language.present?
end
end

unlocked_obj.creators = item_data[:creators].split('|') if item_data[:creators].present?
unlocked_obj.subject = item_data[:subjects].split('|') if item_data[:subjects].present?
unlocked_obj.created = item_data[:date_created].to_s
unlocked_obj.subject = item_data[:subject].split('|') if item_data[:subject].present?
unlocked_obj.created = item_data[:created].to_s
unlocked_obj.description = item_data[:description]

# Handle visibility and embargo logic
if item_data[:visibility].present?
unlocked_obj.visibility = ControlledVocabulary.jupiter_core.visibility.from_value(item_data[:visibility])
unlocked_obj.visibility = ControlledVocabulary.jupiter_core.visibility.from_value(item_data[:visibility].downcase)
end

if item_data[:visibility_after_embargo].present?
unlocked_obj.visibility_after_embargo =
ControlledVocabulary.jupiter_core.visibility.from_value(item_data[:visibility_after_embargo])
ControlledVocabulary.jupiter_core.visibility.from_value(item_data[:visibility_after_embargo].downcase)
end

unlocked_obj.embargo_end_date = item_data[:embargo_end_date].to_date if item_data[:embargo_end_date].present?
@@ -137,7 +186,7 @@
ControlledVocabulary.era.license.from_value(item_data[:license]) ||
ControlledVocabulary.era.old_license.from_value(item_data[:license])
end
unlocked_obj.rights = item_data[:license_text]
unlocked_obj.rights = item_data[:rights]

# Additional fields
unlocked_obj.contributors = item_data[:contributors].split('|') if item_data[:contributors].present?
@@ -156,27 +205,27 @@

log "ITEM #{index}: Starting ingest of file for item..."

# We only support for single file ingest, but this could easily be refactored for multiple files
File.open("#{csv_directory}/#{item_data[:file_name]}", 'r') do |file|
item.add_and_ingest_files([file])
# Support multiple file names separated by '|'
item_data[:file_name].tr('\\', '/').split('|').each do |file_name|
log "ITEM #{index}: Uploading file: #{file_name}..."
File.open("#{csv_directory}/#{file_name}", 'r') do |file|
item.add_and_ingest_files([file])
end
end

log "ITEM #{index}: Setting thumbnail for item..."

item.set_thumbnail(item.files.first) if item.files.first.present?

log "ITEM #{index}: Successfully ingested an item! Item ID: `#{item.id}`"

log "ITEM #{index}: Successfully ingested an item! Item ID: '#{item.id}', #{item.title}"
item
rescue StandardError => e
log 'ERROR: Ingest of item failed! The following error occured:'
log "ERROR: Ingest of item by #{item_data[:title]} failed! The following error occurred:"
log "EXCEPTION: #{e.message}"
log 'WARNING: Please be careful with rerunning batch ingest! Duplication of items may happen ' \
'if previous items were successfully deposited.'
exit 1
log "BACKTRACE: #{e.backtrace.take(1).join("\n")}"
[item_data, e]
end

def thesis_ingest(thesis_data, index, csv_directory, checksums)
def thesis_ingest(thesis_data, index, csv_directory)
log "THESIS #{index}: Starting ingest of a thesis..."
thesis = Thesis.new
thesis.tap do |unlocked_obj|
@@ -255,40 +304,108 @@ def thesis_ingest(thesis_data, index, csv_directory, checksums)
unlocked_obj.save!
end

log "THESIS #{index}: Identifying file by checksum ..."
file_name = checksums[thesis_data[:md5]]
puts file_name

log "THESIS #{index}: Starting ingest of file for thesis..."

# We only support single file ingest, but this could easily be refactored for multiple files
File.open("#{csv_directory}/#{file_name}", 'r') do |file|
File.open("#{csv_directory}/#{thesis_data[:file_name]}", 'r') do |file|
thesis.add_and_ingest_files([file])
end

log "THESIS #{index}: Setting thumbnail for thesis..."

thesis.set_thumbnail(thesis.files.first) if thesis.files.first.present?

log "THESIS #{index}: Successfully ingested an thesis! Thesis ID: `#{thesis.id}`"

log "THESIS #{index}: Successfully ingested an thesis! Thesis ID: '#{thesis.id}', #{thesis.title}"
thesis
rescue StandardError => e
log "ERROR: Ingest of thesis by #{thesis_data[:author]} failed! The following error occured:"
log "ERROR: Ingest of thesis #{thesis_data[:title]} failed! The following error occurred:"
log "EXCEPTION: #{e.message}"
log 'WARNING: Please be careful with rerunning batch ingest! Duplication of theses may happen ' \
'if previous theses were successfully deposited.'
exit
log "BACKTRACE: #{e.backtrace.take(1).join("\n")}"
[thesis_data, e]
end

def generate_checksums(csv_directory)
require 'digest/md5'
checksums = {}
Dir.glob("#{csv_directory}/*.pdf").each do |f|
checksum = Digest::MD5.hexdigest(File.read(f))
checksums[checksum] = File.basename(f)
def legacy_thesis_ingest(thesis_data, index, csv_directory)
log "THESIS #{index}: Starting ingest of a legacy thesis..."
thesis = Thesis.new
thesis.tap do |unlocked_obj|
# constant fields
unlocked_obj.owner_id = 1
unlocked_obj.depositor = 'erahelp@ualberta.ca'
unlocked_obj.institution = ControlledVocabulary.era.institution.from_value(:uofa)

# legacy thesis template fields
unlocked_obj.proquest = thesis_data[:proquest] if thesis_data[:proquest].present?
unlocked_obj.dissertant = thesis_data[:dissertant]
unlocked_obj.title = thesis_data[:title]
unlocked_obj.alternative_title = thesis_data[:alternative_title] if thesis_data[:alternative_title].present?
if thesis_data[:language].present?
unlocked_obj.language =
ControlledVocabulary.era.language.from_value(thesis_data[:language].downcase)
end
unlocked_obj.subject = thesis_data[:subject].split('|')
unlocked_obj.abstract = thesis_data[:abstract] if thesis_data[:abstract].present?
unlocked_obj.thesis_level = thesis_data[:thesis_level]
unlocked_obj.degree = thesis_data[:degree]
unlocked_obj.departments = thesis_data[:departments].split('|')
unlocked_obj.specialization = thesis_data[:specialization] if thesis_data[:specialization].present?

# Assumes the graduation date in the received data always follows the pattern
# "Fall yyyy" -> yyyy-11 or "Spring yyyy" -> yyyy-06; a bare "yyyy" is also accepted
if thesis_data[:graduation_date].is_a? Integer
unlocked_obj.graduation_date = thesis_data[:graduation_date].to_s
else
graduation_year_array = thesis_data[:graduation_date]&.match(/\d\d\d\d/)
graduation_year = graduation_year_array[0]
graduation_term_array = thesis_data[:graduation_date]&.match(/Fall|Spring/)
graduation_term_string = graduation_term_array[0]
graduation_term = '11' if graduation_term_string == 'Fall'
graduation_term = '06' if graduation_term_string == 'Spring'
unlocked_obj.graduation_date = "#{graduation_year}-#{graduation_term}"
end
unlocked_obj.supervisors = thesis_data[:supervisors].split('|') if thesis_data[:supervisors].present?
if thesis_data[:committee_members].present?
unlocked_obj.committee_members = thesis_data[:committee_members].split('|')
end
unlocked_obj.rights = thesis_data[:rights]
if thesis_data[:date_submitted].present?
unlocked_obj.date_submitted = Date.strptime(thesis_data[:date_submitted].to_s,
'%Y-%m-%d')
end
if thesis_data[:date_accepted].present?
unlocked_obj.date_accepted = Date.strptime(thesis_data[:date_accepted].to_s,
'%Y-%m-%d')
end
if thesis_data[:embargo_end_date].present?
unlocked_obj.embargo_end_date = Date.strptime(thesis_data[:embargo_end_date].to_s,
'%Y-%m-%d')
end
if thesis_data[:visibility_after_embargo].present?
unlocked_obj.visibility_after_embargo =
ControlledVocabulary.jupiter_core.visibility.from_value(thesis_data[:visibility_after_embargo])
end
unlocked_obj.visibility = ControlledVocabulary.jupiter_core.visibility.from_value(thesis_data[:visibility])
unlocked_obj.add_to_path(thesis_community_id, thesis_collection_id)

# Add extra field, fedora3_uuid for fedora3 thesis redirecting
# Ex: uuid:4cc160fc-a141-410f-a64d-d0119ad0b9fb
unlocked_obj.fedora3_uuid = thesis_data[:fedora3_uuid] if thesis_data[:fedora3_uuid].present?

# save thesis object
unlocked_obj.save!
end
checksums

log "THESIS #{index}: Starting ingest of file for legacy thesis..."

# We only support single file ingest, but this could easily be refactored for multiple files
File.open("#{csv_directory}/#{thesis_data[:file_name]}", 'r') do |file|
thesis.add_and_ingest_files([file])
end
log "THESIS #{index}: Setting thumbnail for legacy thesis..."
thesis.set_thumbnail(thesis.files.first) if thesis.files.first.present?
log "THESIS #{index}: Successfully ingested an legacy thesis! Thesis ID: '#{thesis.id}', #{thesis.title}"
thesis
rescue StandardError => e
log "ERROR: Ingest of legacy thesis #{thesis_data[:title]} failed! The following error occurred:"
log "EXCEPTION: #{e.message}"
log "BACKTRACE: #{e.backtrace.take(1).join("\n")}"
[thesis_data, e]
end

def thesis_community_id
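For orientation, here is a minimal sketch of how a one-row items CSV could be assembled for the jupiter:batch_ingest_items task. The column names mirror the symbols read in item_ingest; the /tmp/batch directory, the sample.pdf file name, and all sample values are illustrative assumptions, and the item_type, languages, visibility, and license strings would have to match the application's controlled vocabularies.

require 'csv'
require 'fileutils'

# Placeholder staging directory; the referenced file (sample.pdf) must sit beside the CSV,
# because item_ingest opens "#{csv_directory}/#{file_name}".
csv_directory = '/tmp/batch'
FileUtils.mkdir_p(csv_directory)

headers = %w[title item_type languages creators subject created description
             visibility license rights file_name]
row = ['Sample item', 'report', 'english', 'Doe, Jane', 'Batch ingest|Testing',
       '2021', 'A placeholder description.', 'public', 'sample-license-value',
       'All rights reserved.', 'sample.pdf']

CSV.open("#{csv_directory}/items.csv", 'wb') do |csv|
  csv << headers
  csv << row
end

# From the application root:
#   bundle exec rake jupiter:batch_ingest_items[/tmp/batch/items.csv]

Note that multi-valued columns (creators, subject, languages, contributors, file_name) use '|' as the separator, and the success report rewrites item URLs with the HOSTNAME environment variable, so that should be set before a run.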