diff --git a/app/controllers/catalog_controller.rb b/app/controllers/catalog_controller.rb
index 01940996a3..fcf381063f 100644
--- a/app/controllers/catalog_controller.rb
+++ b/app/controllers/catalog_controller.rb
@@ -41,7 +41,7 @@ def self.uploaded_field
 
     # IiifPrint index fields
     config.add_index_field 'all_text_timv'
-    config.add_index_field 'file_set_text_tsimv', label: "Item contents", highlight: true, helper_method: :render_ocr_snippets
+    config.add_index_field 'all_text_tsimv', label: "Item contents", highlight: true, helper_method: :render_ocr_snippets, if: :query_present?
 
     # configuration for Blacklight IIIF Content Search
     config.iiif_search = {
@@ -83,7 +83,7 @@ def self.uploaded_field
       rows: 10,
       qf: (
         IiifPrint.config.metadata_fields.keys.map { |attribute| "#{attribute}_tesim" } +
-        ["title_tesim", "description_tesim", "all_text_timv", "file_set_text_tsimv"]
+        ["title_tesim", "description_tesim", "all_text_timv", "all_text_tsimv"]
       ).uniq.join(' '),
       "hl": true,
       "hl.simple.pre": "",
@@ -638,5 +638,10 @@ def show
   def render_bookmarks_control?
     false
   end
+
+  # Display condition for the 'all_text_tsimv' index field: true when the request has a `q` param.
+  def query_present?
+    params[:q].present?
+  end
 end
 # rubocop:enable Metrics/ClassLength, Metrics/BlockLength
diff --git a/app/indexers/concerns/hyku_indexing.rb b/app/indexers/concerns/hyku_indexing.rb
index 41a834a90a..17073eb6e7 100644
--- a/app/indexers/concerns/hyku_indexing.rb
+++ b/app/indexers/concerns/hyku_indexing.rb
@@ -21,8 +21,7 @@ module HykuIndexing
         solr_doc['account_institution_name_ssim'] = Site.instance.institution_label
         solr_doc['valkyrie_bsi'] = object.kind_of?(Valkyrie::Resource)
         solr_doc['member_ids_ssim'] = object.member_ids.map(&:id) if object.kind_of?(Valkyrie::Resource)
-        # TODO: Reinstate once valkyrie fileset work is complete - https://github.com/scientist-softserv/hykuup_knapsack/issues/34
-        solr_doc['all_text_tsimv'] = full_text(object.file_sets.first&.id) if object.kind_of?(ActiveFedora::Base)
+        solr_doc['all_text_tsimv'] = extract_full_text(object)
         # rubocop:enable Style/ClassCheck
         solr_doc['title_ssim'] = SortTitle.new(object.title.first).alphabetical
         solr_doc['depositor_ssi'] = object.depositor
@@ -38,12 +37,46 @@ module HykuIndexing
 
   private
 
-  def full_text(file_set_id)
-    return if !Flipflop.default_pdf_viewer? || file_set_id.blank?
+  # Builds the value indexed as `all_text_tsimv`: the 'txt' derivative text of the
+  # file sets of the object's child works, joined with a separator line. Falls back
+  # to the text indexed on the object's own first file set when no such text exists.
+  def extract_full_text(object)
+    child_works = Hyrax.custom_queries.find_child_works(resource: object)
+
+    if child_works.empty?
+      extract_text_from_pdf_directly(object)
+    else
+      file_set_texts = child_works_file_sets(child_works).map { |fs| all_text(fs) }.select(&:present?)
+      if file_set_texts.empty?
+        extract_text_from_pdf_directly(object)
+      else
+        file_set_texts.join("\n---------------------------\n")
+      end
+    end
+  end
+
+  # Reads `all_text_tsimv` from the Solr document of the object's first child file set, if any.
+  def extract_text_from_pdf_directly(object)
+    file_set_id = Hyrax.custom_queries.find_child_file_sets(resource: object).first&.id&.to_s
+    return if file_set_id.blank?
 
     SolrDocument.find(file_set_id)['all_text_tsimv']
   end
 
+  # Collects the child file sets of every given child work into one list.
+  def child_works_file_sets(child_works)
+    child_works.flat_map { |child_work| Hyrax.custom_queries.find_child_file_sets(resource: child_work) }
+  end
+
+  # Returns the file set's 'txt' derivative data with newlines turned into spaces and
+  # runs of spaces squeezed, or '' when that data is nil or empty.
+  def all_text(fs)
+    text = IiifPrint::Data::WorkDerivatives.data(from: fs, of_type: 'txt') || ''
+    return text if text.empty?
+
+    text.tr("\n", ' ').squeeze(' ')
+  end
+
   def add_date(solr_doc)
     date_string = solr_doc['date_created_tesim']&.first
     return unless date_string
diff --git a/app/indexers/hyku/indexers/file_set_indexer.rb b/app/indexers/hyku/indexers/file_set_indexer.rb
new file mode 100644
index 0000000000..febbb2dd7d
--- /dev/null
+++ b/app/indexers/hyku/indexers/file_set_indexer.rb
@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+
+module Hyku
+  module Indexers
+    class FileSetIndexer < Hyrax::Indexers::FileSetIndexer
+      include Hyrax::Indexer(:bulkrax_metadata)
+      include Hyrax::Indexer(:hyku_file_set_metadata)
+
+      # Sets `all_text_timv` and `all_text_tsimv` to the PDF's extracted text when the
+      # default PDF viewer feature is on; otherwise returns the parent's document as-is.
+      def to_solr
+        return super unless Flipflop.default_pdf_viewer?
+
+        super.tap do |solr_doc|
+          solr_doc['all_text_timv'] = solr_doc['all_text_tsimv'] = pdf_text
+        end
+      end
+
+      private
+
+      # Pipes the original file's content through the `pdftotext` command and returns
+      # its output as UTF-8 with newlines replaced by spaces. Returns nil when the file
+      # is not a PDF, its content is not a String, or `pdftotext` is not installed.
+      # rubocop:disable Metrics/MethodLength
+      def pdf_text
+        return unless resource.original_file&.pdf?
+        return unless resource.original_file&.content.is_a? String
+
+        begin
+          text = IO.popen(['pdftotext', '-', '-'], 'r+b') do |pdftotext|
+            pdftotext.write(resource.original_file.content)
+            pdftotext.close_write
+            pdftotext.read
+          end
+
+          text.tr("\n", ' ')
+              .squeeze(' ')
+              .encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '') # remove non-UTF-8 characters
+        rescue Errno::ENOENT => e
+          raise e unless e.message.include?("No such file or directory - pdftotext")
+          Rails.logger.warn("`pdftotext' is not installed; unable to extract text from the PDF's content")
+          nil # keep the logger's return value from being indexed as the text
+        end
+      end
+      # rubocop:enable Metrics/MethodLength
+    end
+  end
+end
diff --git a/config/initializers/hyrax.rb b/config/initializers/hyrax.rb
index 01d5fe3801..f45a40cb1c 100644
--- a/config/initializers/hyrax.rb
+++ b/config/initializers/hyrax.rb
@@ -242,6 +242,8 @@
   # essence a "super" method.
   original_translator = config.translate_id_to_uri
   config.translate_id_to_uri = ->(id) { original_translator.call(id.to_s) }
+
+  config.file_set_indexer = Hyku::Indexers::FileSetIndexer
 end
 # rubocop:enable Metrics/BlockLength
 
diff --git a/spec/fixtures/pdf/pdf_sample.pdf b/spec/fixtures/pdf/pdf_sample.pdf
new file mode 100644
index 0000000000..774c2ea70c
Binary files /dev/null and b/spec/fixtures/pdf/pdf_sample.pdf differ
diff --git a/spec/indexers/hyku/indexers/file_set_indexer_spec.rb b/spec/indexers/hyku/indexers/file_set_indexer_spec.rb
new file mode 100644
index 0000000000..5d0fef9682
--- /dev/null
+++ b/spec/indexers/hyku/indexers/file_set_indexer_spec.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+RSpec.describe Hyku::Indexers::FileSetIndexer do
+  let(:indexer_class) { described_class }
+  let(:resource) { Hyrax.config.file_set_model.constantize.new }
+  let(:original_file) { Hyrax::FileMetadata.new }
+
+  it 'is the configured file set indexer' do
+    expect(Hyrax.config.file_set_indexer).to eq described_class
+  end
+
+  describe '#to_solr' do
+    let(:stream) { File.binread('spec/fixtures/pdf/pdf_sample.pdf') }
+    it 'indexes the text of a pdf that has text already' do
+      allow(Flipflop).to receive(:default_pdf_viewer?).and_return(true)
+      allow(resource).to receive(:original_file).and_return(original_file)
+      allow(original_file).to receive(:pdf?).and_return(true)
+      allow(original_file).to receive(:content).and_return(stream)
+
+      expect(resource.to_solr['all_text_tsimv']).to include('Dummy PDF file')
+    end
+  end
+end