Skip to content

Commit

Permalink
Remove collection size feature
Browse files Browse the repository at this point in the history
While looking into #4100, we noticed that `CharacterizeJob` not only saves a file set's `original_file`, but also reindexes the file set, *and* reindexes every collection to which the file set's work belongs. It appears that this feature was added way back in the Sufia days in order to support the use case of displaying on the collection show page the total size of all files uploaded to a collection's works.

Why is this problematic?

At large scale, this means touching a potentially huge portion of the repository—iterating over all files in all works in a collection—in order to display an arguably useful string to a subset of repository users who care about such information. Doing this in `CharacterizeJob` introduces significant potential delay in the ingest pipeline, increasing how long it takes for a deposit to finish and render in the Hyrax UI (e.g., by delaying the creation of derivatives).

We propose removing this feature for now and encourage folks who need this feature to contribute it back with a better-performing, more streamlined design.
  • Loading branch information
mjgiarlo committed Jan 23, 2020
1 parent 1707bc0 commit 567cf77
Show file tree
Hide file tree
Showing 7 changed files with 2 additions and 73 deletions.
2 changes: 0 additions & 2 deletions app/indexers/hyrax/collection_indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ def generate_solr_document
super.tap do |solr_doc|
# Makes Collections show under the "Collections" tab
solr_doc['generic_type_sim'] = ["Collection"]
# Index the size of the collection in bytes
solr_doc['bytes_lts'] = object.bytes
solr_doc['visibility_ssi'] = object.visibility

object.in_collections.each do |col|
Expand Down
1 change: 0 additions & 1 deletion app/jobs/characterize_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def characterize(file_set, _file_id, filepath)
file_set.characterization_proxy.alpha_channels = channels(filepath) if file_set.image? && Hyrax.config.iiif_image_server?
file_set.characterization_proxy.save!
file_set.update_index
file_set.parent&.in_collections&.each(&:update_index)
end

def channels(filepath)
Expand Down
30 changes: 0 additions & 30 deletions app/models/concerns/hyrax/collection_behavior.rb
Original file line number Diff line number Diff line change
Expand Up @@ -111,20 +111,6 @@ def collection_type_gid_document_field_name
end
end

# Compute the sum of each file in the collection using Solr to
# avoid having to access Fedora
#
# @return [Fixnum] size of collection in bytes
# @raise [RuntimeError] unsaved record does not exist in solr
def bytes
return 0 if member_object_ids.empty?

raise "Collection must be saved to query for bytes" if new_record?

# One query per member_id because Solr is not a relational database
member_object_ids.collect { |work_id| size_for_work(work_id) }.sum
end

# @api public
# Retrieve the permission template for this collection.
# @return [Hyrax::PermissionTemplate]
Expand Down Expand Up @@ -169,22 +155,6 @@ def visibility_group
[]
end

# Calculate the size of all the files in the work
# @param work_id [String] identifer for a work
# @return [Integer] the size in bytes
def size_for_work(work_id)
argz = { fl: "id, #{file_size_field}",
fq: "{!join from=#{member_ids_field} to=id}id:#{work_id}" }
files = ::FileSet.search_with_conditions({}, argz)
files.reduce(0) { |sum, f| sum + f[file_size_field].to_i }
end

# Field name to look up when locating the size of each file in Solr.
# Override for your own installation if using something different
def file_size_field
"file_size_lts"
end

# Solr field name works use to index member ids
def member_ids_field
"member_ids_ssim"
Expand Down
8 changes: 1 addition & 7 deletions app/presenters/hyrax/collection_presenter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def collection_type
# Terms is the list of fields displayed by
# app/views/collections/_show_descriptions.html.erb
def self.terms
[:total_items, :size, :resource_type, :creator, :contributor, :keyword, :license, :publisher, :date_created, :subject,
[:total_items, :resource_type, :creator, :contributor, :keyword, :license, :publisher, :date_created, :subject,
:language, :identifier, :based_near, :related_url]
end

Expand All @@ -51,19 +51,13 @@ def terms_with_values

def [](key)
case key
when :size
size
when :total_items
total_items
else
solr_document.send key
end
end

def size
number_to_human_size(@solr_document['bytes_lts'])
end

def total_items
ActiveFedora::Base.where("member_of_collection_ids_ssim:#{id}").count
end
Expand Down
2 changes: 0 additions & 2 deletions spec/indexers/hyrax/collection_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
let(:doc) do
{
'generic_type_sim' => ['Collection'],
'bytes_lts' => 1000,
'thumbnail_path_ss' => '/downloads/1234?file=thumbnail',
'member_of_collection_ids_ssim' => [col1id, col2id],
'member_of_collections_ssim' => [col1title, col2title],
Expand All @@ -20,7 +19,6 @@

describe "#generate_solr_document" do
before do
allow(collection).to receive(:bytes).and_return(1000)
allow(collection).to receive(:in_collections).and_return([col1, col2])
allow(Hyrax::ThumbnailPathService).to receive(:call).and_return("/downloads/1234?file=thumbnail")
end
Expand Down
14 changes: 0 additions & 14 deletions spec/jobs/characterize_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,4 @@
expect { described_class.perform_now(file_set, file.id) }.to raise_error(StandardError, /original_file was not found/)
end
end

context "when the file set's work is in a collection" do
let(:work) { build(:generic_work) }
let(:collection) { build(:collection_lw) }

before do
allow(file_set).to receive(:parent).and_return(work)
allow(work).to receive(:in_collections).and_return([collection])
end
it "reindexes the collection" do
expect(collection).to receive(:update_index)
described_class.perform_now(file_set, file.id)
end
end
end
18 changes: 1 addition & 17 deletions spec/presenters/hyrax/collection_presenter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
subject { described_class.terms }

it do
is_expected.to eq [:total_items, :size, :resource_type, :creator,
is_expected.to eq [:total_items, :resource_type, :creator,
:contributor, :keyword, :license, :publisher,
:date_created, :subject, :language, :identifier,
:based_near, :related_url]
Expand All @@ -25,9 +25,6 @@
let(:presenter) { described_class.new(solr_doc, ability) }
let(:solr_doc) { SolrDocument.new(collection.to_solr) }

# Mock bytes so collection does not have to be saved.
before { allow(collection).to receive(:bytes).and_return(0) }

describe "collection type methods" do
subject { presenter }

Expand Down Expand Up @@ -81,7 +78,6 @@

it do
is_expected.to eq [:total_items,
:size,
:resource_type,
:keyword,
:date_created,
Expand Down Expand Up @@ -126,12 +122,6 @@
it { is_expected.to eq ['adc12v'] }
end

describe "#size", :clean_repo do
subject { presenter.size }

it { is_expected.to eq '0 Bytes' }
end

describe "#total_items", :clean_repo do
subject { presenter.total_items }

Expand Down Expand Up @@ -284,12 +274,6 @@
end
end

describe "#size", :clean_repo do
subject { presenter.size }

it { is_expected.to eq '0 Bytes' }
end

describe "#parent_collection_count" do
subject { presenter.parent_collection_count }

Expand Down

0 comments on commit 567cf77

Please sign in to comment.