Skip to content

Commit

Permalink
extracted text for valkyrie and wings (#6110)
Browse files Browse the repository at this point in the history
* Renamed `size` and `type` to `recorded_size` and `pcdm_use`. Use Valkyrie for Wings derivative creation. Updated active_fedora_converter to load extracted_text into Fedora. Modified derivative services to accept extracted text for Wings derivatives. Updated the file_set indexer to save extracted_text_id_ssi. Added a guard to the Wings to_rdf wrapper so that it cannot double-wrap has_model.

---------

Co-authored-by: Rob Kaufman <rob@notch8.com>
  • Loading branch information
sephirothkod and orangewolf authored Aug 9, 2023
1 parent c3eb490 commit 50e1a25
Show file tree
Hide file tree
Showing 31 changed files with 167 additions and 76 deletions.
13 changes: 12 additions & 1 deletion .dassie/db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2021_11_30_181150) do
ActiveRecord::Schema.define(version: 2023_07_25_222727) do

# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
Expand Down Expand Up @@ -140,6 +140,17 @@
t.index ["machine_id"], name: "index_hyrax_collection_types_on_machine_id", unique: true
end

# Defines the "hyrax_counter_metrics" table. Each row carries a work type, a
# resource type, an integer work id, a date, and two integer totals (item
# investigations and item requests), plus the usual non-null timestamps.
# NOTE(review): the column names suggest COUNTER-style usage reporting —
# confirm against the model that uses this table.
# `force: :cascade` makes the schema load drop an existing table (and its
# dependents) before re-creating it.
create_table "hyrax_counter_metrics", force: :cascade do |t|
t.string "worktype"
t.string "resource_type"
t.integer "work_id"
t.date "date"
t.integer "total_item_investigations"
t.integer "total_item_requests"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end

create_table "hyrax_default_administrative_set", force: :cascade do |t|
t.string "default_admin_set_id", null: false
t.datetime "created_at", null: false
Expand Down
6 changes: 6 additions & 0 deletions app/indexers/hyrax/file_set_indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def generate_solr_document # rubocop:disable Metrics/AbcSize, Metrics/MethodLeng
solr_doc['original_checksum_tesim'] = object.original_checksum
solr_doc['alpha_channels_ssi'] = object.alpha_channels
solr_doc['original_file_id_ssi'] = original_file_id
solr_doc['extracted_text_id_ssi'] = extracted_text_id
solr_doc['generic_type_si'] = 'FileSet'
end
end
Expand All @@ -46,6 +47,11 @@ def original_file_id
Hyrax::VersioningService.versioned_file_id object.original_file
end

# Versioned file id of the indexed object's extracted text file.
#
# @return [Object, nil] whatever Hyrax::VersioningService.versioned_file_id
#   returns for the extracted text file, or nil when the object has none
def extracted_text_id
  object.extracted_text ? Hyrax::VersioningService.versioned_file_id(object.extracted_text) : nil
end

def file_format
if object.mime_type.present? && object.format_label.present?
"#{object.mime_type.split('/').last} (#{object.format_label.join(', ')})"
Expand Down
7 changes: 4 additions & 3 deletions app/indexers/hyrax/valkyrie_file_set_indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,13 @@ def to_solr # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Met

solr_doc['file_format_tesim'] = file_format(file_metadata)
solr_doc['file_format_sim'] = file_format(file_metadata)
solr_doc['file_size_lts'] = file_metadata.size[0]
solr_doc['type_tesim'] = file_metadata.type.map(&:to_s) if file_metadata.type.present?
solr_doc['file_size_lts'] = file_metadata.recorded_size[0]
solr_doc['type_tesim'] = file_metadata.pcdm_use.map(&:to_s) if file_metadata.pcdm_use.present?
solr_doc['pcdm_use_tesim'] = file_metadata.pcdm_use.map(&:to_s) if file_metadata.pcdm_use.present?

# attributes set by fits
solr_doc['format_label_tesim'] = file_metadata.format_label if file_metadata.format_label.present?
solr_doc['size_tesim'] = file_metadata.size if file_metadata.size.present?
solr_doc['size_tesim'] = file_metadata.recorded_size if file_metadata.recorded_size.present?
solr_doc['well_formed_tesim'] = file_metadata.well_formed if file_metadata.well_formed.present?
solr_doc['valid_tesim'] = file_metadata.valid if file_metadata.valid.present?
solr_doc['fits_version_tesim'] = file_metadata.fits_version if file_metadata.fits_version.present?
Expand Down
1 change: 0 additions & 1 deletion app/jobs/valkyrie_ingest_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def perform(file, pcdm_use: Hyrax::FileMetadata::Use::ORIGINAL_FILE)
def ingest(file:, pcdm_use:)
file_set_uri = Valkyrie::ID.new(file.file_set_uri)
file_set = Hyrax.query_service.find_by(id: file_set_uri)

updated_metadata = upload_file(
file: file,
file_set: file_set,
Expand Down
5 changes: 3 additions & 2 deletions app/models/concerns/hyrax/file_set/derivatives.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,9 @@ module Derivatives
# # ./spec/jobs/create_derivatives_job_spec.rb:7:in `block (2 levels) in <top (required)>'
#
Hydra::Derivatives.source_file_service = Hyrax::LocalFileService
Hydra::Derivatives.output_file_service = Hyrax::PersistDerivatives
Hydra::Derivatives::FullTextExtract.output_file_service = Hyrax::PersistDirectlyContainedOutputFileService
Hydra::Derivatives.output_file_service = Hyrax::ValkyriePersistDerivatives
# Hydra::Derivatives::FullTextExtract.output_file_service = Hyrax::PersistDirectlyContainedOutputFileService
Hydra::Derivatives::FullTextExtract.output_file_service = Hyrax::ValkyriePersistDerivatives
before_destroy :cleanup_derivatives
# This completely overrides the version in Hydra::Works so that we
# read and write to a local file. It's important that characterization runs
Expand Down
13 changes: 6 additions & 7 deletions app/models/hyrax/file_metadata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def self.FileMetadata(file)
"with id #{file.id}. Initializing a new one")

FileMetadata.new(file_identifier: file.id,
alternative_ids: [file.id],
original_filename: File.basename(file.io))
end

Expand Down Expand Up @@ -67,11 +66,11 @@ def uri_for(use:)
attribute :label, ::Valkyrie::Types::Set
attribute :original_filename, ::Valkyrie::Types::String
attribute :mime_type, ::Valkyrie::Types::String.default(GENERIC_MIME_TYPE)
attribute :type, ::Valkyrie::Types::Set.default([Use::ORIGINAL_FILE].freeze) # Use += to add types, not <<
attribute :pcdm_use, ::Valkyrie::Types::Set.default([Use::ORIGINAL_FILE].freeze) # Use += to add pcdm_uses, not <<

# attributes set by fits
attribute :format_label, ::Valkyrie::Types::Set
attribute :size, ::Valkyrie::Types::Set
attribute :recorded_size, ::Valkyrie::Types::Set
attribute :well_formed, ::Valkyrie::Types::Set
attribute :valid, ::Valkyrie::Types::Set
attribute :date_created, ::Valkyrie::Types::Set
Expand Down Expand Up @@ -130,19 +129,19 @@ def uri_for(use:)
##
# @return [Boolean]
def original_file?
type.include?(Use::ORIGINAL_FILE)
pcdm_use.include?(Use::ORIGINAL_FILE)
end

##
# @return [Boolean]
def thumbnail_file?
type.include?(Use::THUMBNAIL)
pcdm_use.include?(Use::THUMBNAIL)
end

##
# @return [Boolean]
def extracted_file?
type.include?(Use::EXTRACTED_TEXT)
pcdm_use.include?(Use::EXTRACTED_TEXT)
end

def title
Expand All @@ -154,7 +153,7 @@ def download_id
end

def valid?
file.valid?(size: size.first, digests: { sha256: checksum&.first&.sha256 })
file.valid?(size: recorded_size.first, digests: { sha256: checksum&.first&.sha256 })
end

##
Expand Down
2 changes: 1 addition & 1 deletion app/services/hyrax/custom_queries/find_file_metadata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def find_many_file_metadata_by_use(resource:, use:)
return [] unless resource.try(:file_ids)

results = find_many_file_metadata_by_ids(ids: resource.file_ids)
results.select { |fm| fm.type.include?(use) }
results.select { |fm| fm.pcdm_use.include?(use) }
end
end
end
Expand Down
2 changes: 2 additions & 0 deletions app/services/hyrax/derivative_path.rb
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ def extension
case destination_name
when 'thumbnail'
".#{MIME::Types.type_for('jpg').first.extensions.first}"
when 'extracted_text'
".#{MIME::Types.type_for('txt').first.extensions.first}"
else
".#{destination_name}"
end
Expand Down
5 changes: 3 additions & 2 deletions app/services/hyrax/file_set_derivatives_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def create_pdf_derivatives(filename)
url: derivative_url('thumbnail'),
layer: 0
}])
extract_full_text(filename, uri)
extract_full_text(filename, derivative_url('extracted_text'))
end

def create_office_document_derivatives(filename)
Expand All @@ -86,7 +86,7 @@ def create_office_document_derivatives(filename)
url: derivative_url('thumbnail'),
layer: 0
}])
extract_full_text(filename, uri)
extract_full_text(filename, derivative_url('extracted_text'))
end

def create_audio_derivatives(filename)
Expand Down Expand Up @@ -122,6 +122,7 @@ def derivative_path_factory
# @param [String] uri to the file set (delegated to file_set)
def extract_full_text(filename, uri)
  # Full-text extraction is skipped entirely (returning nil) when it is
  # disabled in the Hyrax configuration.
  if Hyrax.config.extract_full_text?
    # The "extracted_text" container is passed along with the target url as
    # the single output directive.
    text_output = { url: uri, container: "extracted_text" }
    Hydra::Derivatives::FullTextExtract.create(filename, outputs: [text_output])
  end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,33 @@ def self.call(content, directives)
# @option directives [String] url URI for the parent object.
def self.retrieve_file_set(directives)
uri = URI(directives.fetch(:url))
raise ArgumentError, "#{uri} is not an http(s) uri" unless uri.is_a?(URI::HTTP)
Hyrax.query_service.find_by_alternate_identifier(alternate_identifier: Hyrax::Base.uri_to_id(uri.to_s), use_valkyrie: false)
if uri.is_a?(URI::HTTP)
Hyrax.query_service.find_by_alternate_identifier(alternate_identifier: Hyrax::Base.uri_to_id(uri.to_s), use_valkyrie: false)
else
raise ArgumentError, "#{uri} is not an file/http(s) uri" unless uri.is_a?(URI::File)

fileset_for_directives(directives)
end
end
private_class_method :retrieve_file_set

# Finds the FileSet that a derivative url belongs to.
#
# The url's path is expected to look something like
# /app/samvera/hyrax-webapp/derivatives/95/93/tv/12/3-thumbnail.jpeg. The
# configured derivatives path and all slashes are stripped, and everything
# before the trailing "-<name>.<ext>" part is used as the FileSet id, which
# in this case would be 9593tv123.
#
# @param directives [Hash] must contain a :url entry
# @return [Hyrax::FileSet]
# @raise [KeyError] when directives has no :url
# @raise [RuntimeError] when no id can be extracted from the path
def self.fileset_for_directives(directives)
  path = URI(directives.fetch(:url)).path
  flattened = path.sub(Hyrax.config.derivatives_path.to_s, "").delete('/')
  # First capture group: everything ahead of the final "-name.ext" suffix.
  id = flattened[/^(.*)-\w*(\.\w+)*$/, 1]
  raise "Could not extract fileset id from path #{path}" unless id

  Hyrax.metadata_adapter.query_service.find_by(id: id)
end
private_class_method :fileset_for_directives

# Override this implementation if you need a remote file from a different location
# @param file_set [FileSet] the container of the remote file
# @param directives [Hash] directions which can be used to determine where to persist to
Expand Down
21 changes: 12 additions & 9 deletions app/services/hyrax/valkyrie_persist_derivatives.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,17 @@ def self.call(stream,
file_set = fileset_for_directives(directives)

# Valkyrie storage adapters will typically expect an IO-like object that
# responds to #path -- here we only have a StringIO, so some
# transformation is in order
# responds to #rewind and #read so we have created a StringIO
tmpfile = Tempfile.new(file_set.id, encoding: 'ascii-8bit')
stream = StringIO.new(stream) if stream.is_a?(String)
stream.rewind
output = tmpfile.write(stream.read)
tmpfile.flush
raise 'blank file detected' if output.zero?

filename = filename(directives)
Hyrax.logger.debug "Uploading thumbnail for FileSet #{file_set.id} as #{filename}"
uploader.upload(
io: tmpfile,
filename: filename,
file_set: file_set,
use: Hyrax::FileMetadata::Use::THUMBNAIL
)
Hyrax.logger.debug "Uploading derivative for FileSet #{file_set.id} as #{filename}"
uploader.upload(io: tmpfile, filename: filename, file_set: file_set, use: file_metadata(directives))
end

# The filepath will look something like
Expand All @@ -59,5 +54,13 @@ def self.fileset_for_directives(directives)
# Last path segment of the directives' url, used as the uploaded file's name.
#
# @param directives [Hash] must contain a :url entry
# @return [String, nil] nil when the url's path has no segments
def self.filename(directives)
  segments = URI(directives.fetch(:url)).path.split('/')
  segments[-1]
end

# Picks the PCDM use for the derivative described by +directives+.
#
# When a :container directive is present, its upcased value names a constant
# on Hyrax::FileMetadata::Use (e.g. "extracted_text" -> EXTRACTED_TEXT).
# Without a :container the derivative is treated as a thumbnail.
#
# @param directives [Hash] derivative directives; :container is optional
# @return [Object] the matching Hyrax::FileMetadata::Use constant
# @raise [NameError] when :container does not name a constant declared on
#   Hyrax::FileMetadata::Use
def self.file_metadata(directives)
  return Hyrax::FileMetadata::Use::THUMBNAIL unless directives.key?(:container)

  # inherit = false: only constants declared on Use itself qualify. A lookup
  # that falls through to Object would let a container such as "env" resolve
  # to ::ENV instead of failing.
  Hyrax::FileMetadata::Use.const_get(directives[:container].upcase, false)
end
end
end
6 changes: 4 additions & 2 deletions app/services/hyrax/valkyrie_upload.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def upload(filename:, file_set:, io:, use: Hyrax::FileMetadata::Use::ORIGINAL_FI
streamfile = storage_adapter.upload(file: io, original_filename: filename, resource: file_set)
file_metadata = Hyrax::FileMetadata(streamfile)
file_metadata.file_set_id = file_set.id
file_metadata.type += [use]
file_metadata.pcdm_use = [use]
file_metadata.recorded_size = [io.size]

if use == Hyrax::FileMetadata::Use::ORIGINAL_FILE
# Set file set label.
Expand All @@ -50,6 +51,7 @@ def upload(filename:, file_set:, io:, use: Hyrax::FileMetadata::Use::ORIGINAL_FI
end

saved_metadata = Hyrax.persister.save(resource: file_metadata)
saved_metadata.original_filename = filename if saved_metadata.original_filename.blank?
Hyrax.publisher.publish("object.file.uploaded", metadata: saved_metadata)

add_file_to_file_set(file_set: file_set,
Expand Down Expand Up @@ -83,7 +85,7 @@ def add_file_to_file_set(file_set:, file_metadata:, user:)
# the file to add
# @return [void]
def set_file_use_ids(file_set, file_metadata)
file_metadata.type.each do |type|
file_metadata.pcdm_use.each do |type|
case type
when Hyrax::FileMetadata::Use::ORIGINAL_FILE
file_set.original_file_id = file_metadata.id
Expand Down
1 change: 1 addition & 0 deletions app/services/hyrax/versioning_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def versions
[]
end
else
return resource.versions if resource.versions.is_a?(Array)
resource.versions.all.to_a
end
end
Expand Down
40 changes: 27 additions & 13 deletions lib/wings/active_fedora_converter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -138,19 +138,30 @@ def apply_attributes_to_model(af_object)
when ActiveFedora::File
add_file_attributes(af_object)
else
converted_attrs = normal_attributes
members = Array.wrap(converted_attrs.delete(:members))
files = converted_attrs.delete(:files)
af_object.attributes = converted_attrs
perform_lease_conversion(af_object: af_object, resource: resource) if resource.try(:lease) && af_object.reflections.include?(:lease)
perform_embargo_conversion(af_object: af_object, resource: resource) if resource.try(:embargo) && af_object.reflections.include?(:embargo)
members.empty? ? af_object.try(:ordered_members)&.clear : af_object.try(:ordered_members=, members)
af_object.try(:members)&.replace(members)
af_object.files.build_or_set(files) if files
parse_attributes(af_object)
end
end
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength

# Applies the converted attributes of +resource+ to +af_object+.
#
# :members and :files are pulled out of the attribute hash and handled
# separately; the remaining attributes are mass-assigned first, then extracted
# text, lease and embargo are converted, and finally members and files are set.
#
# @param af_object [Object] the ActiveFedora object being populated
# @return [Object, nil] the value of the last statement; presumably not
#   meaningful to callers — verify before relying on it
def parse_attributes(af_object)
converted_attrs = normal_attributes
# NOTE(review): delete mutates the hash returned by normal_attributes; if that
# helper memoizes its result, later callers will not see :members/:files — confirm.
members = Array.wrap(converted_attrs.delete(:members))
files = converted_attrs.delete(:files)
af_object.attributes = converted_attrs
# Extracted text is only built/refreshed when the resource records an extracted_text_id.
af_object.extracted_text = create_extrated_text(af_object) if resource.attributes[:extracted_text_id].present?
# Both conversions are invoked unconditionally from here; presumably each one
# decides for itself whether it applies — confirm in the helpers.
perform_lease_conversion(af_object: af_object, resource: resource)
perform_embargo_conversion(af_object: af_object, resource: resource)
# No members: clear any existing ordered members; otherwise assign the new list.
# try/&. keep this a no-op for objects without ordered_members.
members.empty? ? af_object.try(:ordered_members)&.clear : af_object.try(:ordered_members=, members)
af_object.try(:members)&.replace(members)
af_object.files.build_or_set(files) if files
end

# Returns the extracted text file for +af_object+, with its content copied
# from the resource's extracted-text file metadata.
#
# An existing extracted text file is reused when present; otherwise one is
# created through af_object. The content is taken from the first file metadata
# found with the EXTRACTED_TEXT use, and is nil when none is found.
#
# @param af_object [Object] object responding to #extracted_text and
#   #create_extracted_text
# @return [Object] the reused or newly created extracted text file
def create_extrated_text(af_object)
  (af_object.extracted_text.presence || af_object.create_extracted_text).tap do |text_file|
    matches = Hyrax.custom_queries.find_many_file_metadata_by_use(
      resource: resource,
      use: Hyrax::FileMetadata::Use::EXTRACTED_TEXT
    )
    text_file.content = matches.first&.content
  end
end

# Add attributes from resource which aren't AF properties into af_object
def add_access_control_attributes(af_object)
normal_attributes[:permissions].each { |p| p.access_to_id = resource.access_to&.id }
Expand All @@ -159,17 +170,20 @@ def add_access_control_attributes(af_object)

# for files, add attributes to metadata_node, plus some other work
def add_file_attributes(af_object)
af_object.metadata_node.attributes = normal_attributes
converted_attrs = normal_attributes
pcdm_use = converted_attrs.delete(:pcdm_use)
af_object.metadata_node.attributes = converted_attrs
af_object.pcdm_use = pcdm_use.first if pcdm_use.present? && pcdm_use.first.present?
af_object.original_name = resource.original_filename
new_type = (resource.type - af_object.metadata_node.type.to_a).first
new_type = (resource.pcdm_use - af_object.metadata_node.type.to_a).first
af_object.metadata_node.type = new_type if new_type
af_object.mime_type = resource.mime_type
end

def perform_lease_conversion(af_object:, resource:)
# TODO(#6134): af_object.lease.class has the same name as resource.lease.class; however, each class has a different object_id
# so a type mismatch happens. the code below coerces the one object into the other
return if af_object.lease&.id
return if !resource.try(:lease) || !af_object.reflections.include?(:lease) || af_object.lease&.id

resource_lease_dup = af_object.reflections.fetch(:lease).klass.new(resource.lease.attributes.except(:id, :internal_resource, :created_at, :updated_at, :new_record))
af_object.lease = resource_lease_dup
Expand All @@ -178,7 +192,7 @@ def perform_lease_conversion(af_object:, resource:)
def perform_embargo_conversion(af_object:, resource:)
# TODO(#6134): af_object.embargo.class has the same name as resource.embargo.class; however, each class has a different object_id
# so a type mismatch happens. the code below coerces the one object into the other
return if af_object.embargo&.id
return if !resource.try(:embargo) || !af_object.reflections.include?(:embargo) || af_object.embargo&.id

resource_embargo_dup = af_object.reflections.fetch(:embargo).klass.new(resource.embargo.attributes.except(:id, :internal_resource, :created_at, :updated_at, :new_record))
af_object.embargo = resource_embargo_dup
Expand Down
2 changes: 1 addition & 1 deletion lib/wings/active_fedora_converter/default_work.rb
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def model_name(*)
end

def to_rdf_representation
"Wings(#{valkyrie_class})"
"Wings(#{valkyrie_class})" unless valkyrie_class&.to_s&.include?('Wings(')
end
alias inspect to_rdf_representation
alias to_s inspect
Expand Down
2 changes: 1 addition & 1 deletion lib/wings/active_fedora_converter/file_metadata_node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def model_name(*)
end

def to_rdf_representation
"Wings(#{valkyrie_class})"
"Wings(#{valkyrie_class})" unless valkyrie_class&.to_s&.include?('Wings(')
end
alias inspect to_rdf_representation
alias to_s inspect
Expand Down
Loading

0 comments on commit 50e1a25

Please sign in to comment.