Add Collections and FileSets to CSV exports #408

Merged
merged 8 commits on Feb 8, 2022
app/models/bulkrax/csv_file_set_entry.rb (2 changes: 1 addition & 1 deletion)
@@ -12,7 +12,7 @@ def add_path_to_file

parsed_metadata['file'][i] = path_to_file
end
raise ::StandardError, 'one or more file paths are invalid' unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
raise ::StandardError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?

parsed_metadata['file']
end
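The only change in this file is the richer error message, which now lists the offending paths. A standalone sketch of the pattern with made-up paths (illustrative only, not Bulkrax code):

# Illustrative only: validate a list of paths and report every path in the failure message.
file_paths = ['/tmp/import/image_one.tif', '/tmp/import/missing.tif']
unless file_paths.all? { |path| ::File.file?(path) }
  raise ::StandardError, "one or more file paths are invalid: #{file_paths.join(', ')}"
end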
app/parsers/bulkrax/csv_parser.rb (66 changes: 50 additions & 16 deletions)
@@ -177,40 +177,74 @@ def extra_filters
end

def current_work_ids
ActiveSupport::Deprecation.warn('Bulkrax::CsvParser#current_work_ids will be replaced with #current_record_ids in version 3.0')
current_record_ids
end

def current_record_ids
@work_ids = []
@collection_ids = []
@file_set_ids = []

case importerexporter.export_from
when 'all'
ActiveFedora::SolrService.query("has_model_ssim:(#{Hyrax.config.curation_concerns.join(' OR ')}) #{extra_filters}", rows: 2_147_483_647).map(&:id)
@work_ids = ActiveFedora::SolrService.query("has_model_ssim:(#{Hyrax.config.curation_concerns.join(' OR ')}) #{extra_filters}", rows: 2_147_483_647).map(&:id)
@collection_ids = ActiveFedora::SolrService.query("has_model_ssim:Collection #{extra_filters}", rows: 2_147_483_647).map(&:id)
@file_set_ids = ActiveFedora::SolrService.query("has_model_ssim:FileSet #{extra_filters}", rows: 2_147_483_647).map(&:id)
when 'collection'
ActiveFedora::SolrService.query("member_of_collection_ids_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
@work_ids = ActiveFedora::SolrService.query("member_of_collection_ids_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
when 'worktype'
ActiveFedora::SolrService.query("has_model_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
@work_ids = ActiveFedora::SolrService.query("has_model_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
when 'importer'
entry_ids = Bulkrax::Importer.find(importerexporter.export_source).entries.pluck(:id)
complete_statuses = Bulkrax::Status.latest_by_statusable
.includes(:statusable)
.where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
set_ids_for_exporting_from_importer
end

@work_ids + @collection_ids + @file_set_ids
end

# Set the following instance variables: @work_ids, @collection_ids, @file_set_ids
# @see #current_record_ids
def set_ids_for_exporting_from_importer
entry_ids = Importer.find(importerexporter.export_source).entries.pluck(:id)
complete_statuses = Status.latest_by_statusable
.includes(:statusable)
.where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')

complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
extra_filters = extra_filters.presence || '*:*'
complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
extra_filters = extra_filters.presence || '*:*'

ActiveFedora::SolrService.get(
{ :@work_ids => ::Hyrax.config.curation_concerns, :@collection_ids => [::Collection], :@file_set_ids => [::FileSet] }.each do |instance_var, models_to_search|
instance_variable_set(instance_var, ActiveFedora::SolrService.get(
extra_filters.to_s,
fq: "#{work_identifier}_sim:(#{complete_entry_identifiers.join(' OR ')})",
fq: [
"#{work_identifier}_sim:(#{complete_entry_identifiers.join(' OR ')})",
"has_model_ssim:(#{models_to_search.join(' OR ')})"
],
fl: 'id',
rows: 2_000_000_000
)['response']['docs'].map { |obj| obj['id'] }
)['response']['docs'].map { |obj| obj['id'] })
end
end

def create_new_entries
current_work_ids.each_with_index do |wid, index|
current_record_ids.each_with_index do |id, index|
break if limit_reached?(limit, index)
new_entry = find_or_create_entry(entry_class, wid, 'Bulkrax::Exporter')

this_entry_class = if @collection_ids.include?(id)
collection_entry_class
elsif @file_set_ids.include?(id)
file_set_entry_class
else
entry_class
end
new_entry = find_or_create_entry(this_entry_class, id, 'Bulkrax::Exporter')

begin
entry = Bulkrax::ExportWorkJob.perform_now(new_entry.id, current_run.id)
entry = ExportWorkJob.perform_now(new_entry.id, current_run.id)
rescue => e
Rails.logger.info("#{e.message} was detected during export")
end

self.headers |= entry.parsed_metadata.keys if entry
end
end
@@ -267,7 +301,7 @@ def retrieve_cloud_files(files)

def write_files
CSV.open(setup_export_file, "w", headers: export_headers, write_headers: true) do |csv|
importerexporter.entries.where(identifier: current_work_ids)[0..limit || total].each do |e|
importerexporter.entries.where(identifier: current_record_ids)[0..limit || total].each do |e|
csv << e.parsed_metadata
end
end
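For readability, a condensed sketch of the per-model lookup that set_ids_for_exporting_from_importer now performs; the models, Solr fields, and options are taken from the diff above, but the surrounding setup is simplified:

# Sketch: run one Solr query per group of models and stash the ids in the matching instance variable.
{
  :@work_ids => ::Hyrax.config.curation_concerns,
  :@collection_ids => [::Collection],
  :@file_set_ids => [::FileSet]
}.each do |instance_var, models_to_search|
  ids = ActiveFedora::SolrService.get(
    extra_filters.to_s,
    fq: [
      "#{work_identifier}_sim:(#{complete_entry_identifiers.join(' OR ')})",
      "has_model_ssim:(#{models_to_search.join(' OR ')})"
    ],
    fl: 'id',
    rows: 2_000_000_000
  )['response']['docs'].map { |doc| doc['id'] }
  instance_variable_set(instance_var, ids)
end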
spec/factories/bulkrax_exporters.rb (7 changes: 7 additions & 0 deletions)
@@ -33,4 +33,11 @@
limit { 0 }
field_mapping { nil }
end

trait :all do
name { 'Export from All' }
export_type { 'metadata' }
export_from { 'all' }
export_source { nil }
end
end
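A minimal usage sketch of the new :all trait, mirroring how the specs below exercise it; it assumes ActiveFedora::SolrService is stubbed as in the spec file:

# Sketch: an exporter configured to export everything, fed to the CSV parser.
exporter = FactoryBot.create(:bulkrax_exporter, :all)
parser = Bulkrax::CsvParser.new(exporter)
# With Solr stubbed to return works, collections, and file sets, create_new_entries
# builds the matching entry class for each id and runs an export job per record.
parser.create_new_entries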
spec/parsers/bulkrax/csv_parser_spec.rb (67 changes: 58 additions & 9 deletions)
@@ -350,11 +350,13 @@ module Bulkrax
describe '#create_new_entries' do
subject(:parser) { described_class.new(exporter) }
let(:exporter) { FactoryBot.create(:bulkrax_exporter_worktype) }
# Use OpenStructs to simulate the behavior of ActiveFedora::SolrHit instances.
let(:work_ids_solr) { [OpenStruct.new(id: SecureRandom.alphanumeric(9)), OpenStruct.new(id: SecureRandom.alphanumeric(9))] }
let(:collection_ids_solr) { [OpenStruct.new(id: SecureRandom.alphanumeric(9))] }
let(:file_set_ids_solr) { [OpenStruct.new(id: SecureRandom.alphanumeric(9)), OpenStruct.new(id: SecureRandom.alphanumeric(9)), OpenStruct.new(id: SecureRandom.alphanumeric(9))] }

it 'invokes Bulkrax::ExportWorkJob once per Entry' do
# Use OpenStructs to simulate the behavior of ActiveFedora::SolrHit instances.
work_ids = [OpenStruct.new(id: SecureRandom.alphanumeric(9)), OpenStruct.new(id: SecureRandom.alphanumeric(9))]
expect(ActiveFedora::SolrService).to receive(:query).and_return(work_ids)
expect(ActiveFedora::SolrService).to receive(:query).and_return(work_ids_solr)
expect(Bulkrax::ExportWorkJob).to receive(:perform_now).exactly(2).times
parser.create_new_entries
end
@@ -363,9 +365,7 @@ module Bulkrax
let(:exporter) { FactoryBot.create(:bulkrax_exporter_worktype, limit: 1) }

it 'invokes Bulkrax::ExportWorkJob once' do
# Use OpenStructs to simulate the behavior of ActiveFedora::SolrHit instances.
work_ids = [OpenStruct.new(id: SecureRandom.alphanumeric(9)), OpenStruct.new(id: SecureRandom.alphanumeric(9))]
expect(ActiveFedora::SolrService).to receive(:query).and_return(work_ids)
expect(ActiveFedora::SolrService).to receive(:query).and_return(work_ids_solr)
expect(Bulkrax::ExportWorkJob).to receive(:perform_now).exactly(1).times
parser.create_new_entries
end
@@ -375,13 +375,62 @@ module Bulkrax
let(:exporter) { FactoryBot.create(:bulkrax_exporter_worktype, limit: 0) }

it 'invokes Bulkrax::ExportWorkJob once per Entry' do
# Use OpenStructs to simulate the behavior of ActiveFedora::SolrHit instances.
work_ids = [OpenStruct.new(id: SecureRandom.alphanumeric(9)), OpenStruct.new(id: SecureRandom.alphanumeric(9))]
expect(ActiveFedora::SolrService).to receive(:query).and_return(work_ids)
expect(ActiveFedora::SolrService).to receive(:query).and_return(work_ids_solr)
expect(Bulkrax::ExportWorkJob).to receive(:perform_now).exactly(2).times
parser.create_new_entries
end
end

context 'when exporting all' do
let(:exporter) { FactoryBot.create(:bulkrax_exporter, :all) }

before do
allow(ActiveFedora::SolrService).to receive(:query).and_return(work_ids_solr, collection_ids_solr, file_set_ids_solr)
end

it 'exports works, collections, and file sets' do
expect(ExportWorkJob).to receive(:perform_now).exactly(6).times

parser.create_new_entries
end

it 'exports all works' do
work_entry_ids = Entry.where(identifier: work_ids_solr.map(&:id)).map(&:id)
work_entry_ids.each do |id|
expect(ExportWorkJob).to receive(:perform_now).with(id, exporter.last_run.id).once
end

parser.create_new_entries
end

it 'exports all collections' do
collection_entry_ids = Entry.where(identifier: collection_ids_solr.map(&:id)).map(&:id)
collection_entry_ids.each do |id|
expect(ExportWorkJob).to receive(:perform_now).with(id, exporter.last_run.id).once
end

parser.create_new_entries
end

it 'exports all file sets' do
file_set_entry_ids = Entry.where(identifier: file_set_ids_solr.map(&:id)).map(&:id)
file_set_entry_ids.each do |id|
expect(ExportWorkJob).to receive(:perform_now).with(id, exporter.last_run.id).once
end

parser.create_new_entries
end

it 'exported entries are given the correct class' do
expect { parser.create_new_entries }
.to change(CsvFileSetEntry, :count)
.by(3)
.and change(CsvCollectionEntry, :count)
.by(1)
.and change(CsvEntry, :count)
.by(6) # 6 csv entries minus 3 file set entries minus 1 collection entry equals 2 work entries
end
end
end

describe '#path_to_files' do