Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

I769 snippets #2329

Merged
merged 5 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions app/controllers/catalog_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def self.uploaded_field

# IiifPrint index fields
config.add_index_field 'all_text_timv'
config.add_index_field 'file_set_text_tsimv', label: "Item contents", highlight: true, helper_method: :render_ocr_snippets
config.add_index_field 'all_text_tsimv', label: "Item contents", highlight: true, helper_method: :render_ocr_snippets, if: :query_present?

# configuration for Blacklight IIIF Content Search
config.iiif_search = {
Expand Down Expand Up @@ -83,7 +83,7 @@ def self.uploaded_field
rows: 10,
qf: (
IiifPrint.config.metadata_fields.keys.map { |attribute| "#{attribute}_tesim" } +
["title_tesim", "description_tesim", "all_text_timv", "file_set_text_tsimv"]
["title_tesim", "description_tesim", "all_text_timv", "all_text_tsimv"]
).uniq.join(' '),
"hl": true,
"hl.simple.pre": "<span class='highlight'>",
Expand Down Expand Up @@ -638,5 +638,9 @@ def show
# Predicate reporting whether the bookmarks control should be rendered.
# Hard-coded: this always answers false.
# NOTE(review): presumably this overrides a Blacklight default — confirm.
def render_bookmarks_control?
false
end

# Predicate referenced by `if: :query_present?` on the 'all_text_tsimv'
# index field: true only when the request carries a non-blank `q` parameter.
#
# @return [Boolean]
def query_present?
  query = params[:q]
  query.present?
end
end
# rubocop:enable Metrics/ClassLength, Metrics/BlockLength
34 changes: 30 additions & 4 deletions app/indexers/concerns/hyku_indexing.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ module HykuIndexing
solr_doc['account_institution_name_ssim'] = Site.instance.institution_label
solr_doc['valkyrie_bsi'] = object.kind_of?(Valkyrie::Resource)
solr_doc['member_ids_ssim'] = object.member_ids.map(&:id) if object.kind_of?(Valkyrie::Resource)
# TODO: Reinstate once valkyrie fileset work is complete - https://github.com/scientist-softserv/hykuup_knapsack/issues/34
solr_doc['all_text_tsimv'] = full_text(object.file_sets.first&.id) if object.kind_of?(ActiveFedora::Base)
solr_doc['all_text_tsimv'] = extract_full_text(object)
# rubocop:enable Style/ClassCheck
solr_doc['title_ssim'] = SortTitle.new(object.title.first).alphabetical
solr_doc['depositor_ssi'] = object.depositor
Expand All @@ -38,12 +37,39 @@ module HykuIndexing

private

def full_text(file_set_id)
return if !Flipflop.default_pdf_viewer? || file_set_id.blank?
# Builds the full-text value that the indexer stores under 'all_text_tsimv'.
#
# When the object has child works, the text derivatives of every file set
# belonging to those child works are collected, blank ones are dropped, and
# the remainder is joined with a dashed separator line. When there are no
# child works, or none of their file sets yields any text, the lookup falls
# back to #extract_text_from_pdf_directly.
#
# @param object the resource being indexed
# @return [String, nil] the combined text, or whatever the fallback returns
def extract_full_text(object)
  child_works = Hyrax.custom_queries.find_child_works(resource: object)
  return extract_text_from_pdf_directly(object) if child_works.empty?

  file_set_texts = child_works_file_sets(child_works).map { |fs| all_text(fs) }.select(&:present?)
  # Every remaining entry is non-blank, so an emptiness check is enough;
  # there is no need to concatenate all of the text just to test it.
  return extract_text_from_pdf_directly(object) if file_set_texts.empty?

  file_set_texts.join("\n---------------------------\n")
end

# Looks up the text already indexed for the object's first child file set.
#
# Takes the first result of the child-file-set query, stringifies its id,
# and reads 'all_text_tsimv' from that file set's Solr document.
#
# @param object the resource whose first file set should be consulted
# @return the stored 'all_text_tsimv' value, or nil when there is no file
#   set or its id is blank
def extract_text_from_pdf_directly(object)
  first_file_set = Hyrax.custom_queries.find_child_file_sets(resource: object).first
  file_set_id = first_file_set&.id&.to_s
  return unless file_set_id.present?

  SolrDocument.find(file_set_id)['all_text_tsimv']
end

# Collects the file sets of every given child work into one flat list,
# preserving the order of the works and of each work's file sets.
#
# @param child_works [Enumerable] the child works to query
# @return [Array] the concatenated child-file-set query results
def child_works_file_sets(child_works)
  # flat_map concatenates the per-work results in a single pass instead of
  # building an array of arrays and flattening it afterwards.
  child_works.flat_map { |child_work| Hyrax.custom_queries.find_child_file_sets(resource: child_work) }
end

# Fetches the 'txt' derivative data for a file set and collapses it onto a
# single line: newlines become spaces and runs of spaces are squeezed to one.
#
# @param fs the file set whose text derivative is read
# @return [String] the normalized text; an empty result (or a missing
#   derivative, treated as '') is returned untouched
def all_text(fs)
  raw = IiifPrint::Data::WorkDerivatives.data(from: fs, of_type: 'txt') || ''
  raw.empty? ? raw : raw.tr("\n", ' ').squeeze(' ')
end

def add_date(solr_doc)
date_string = solr_doc['date_created_tesim']&.first
return unless date_string
Expand Down
42 changes: 42 additions & 0 deletions app/indexers/hyku/indexers/file_set_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# frozen_string_literal: true

module Hyku
  module Indexers
    # File set indexer that, when the default PDF viewer feature is enabled,
    # adds the text extracted from a PDF original file to the Solr document
    # under both 'all_text_timv' and 'all_text_tsimv'.
    class FileSetIndexer < Hyrax::Indexers::FileSetIndexer
      include Hyrax::Indexer(:bulkrax_metadata)
      include Hyrax::Indexer(:hyku_file_set_metadata)

      # @return [Hash] the parent indexer's Solr document, extended with the
      #   PDF text fields when Flipflop.default_pdf_viewer? is on
      def to_solr
        return super unless Flipflop.default_pdf_viewer?

        super.tap do |solr_doc|
          solr_doc['all_text_timv'] = solr_doc['all_text_tsimv'] = pdf_text
        end
      end

      private

      # Extracts the text of the resource's original file by piping its
      # content through the `pdftotext` command-line tool.
      #
      # @return [String, nil] the text on one line as valid UTF-8, or nil when
      #   the original file is missing, is not a PDF, has non-String content,
      #   or `pdftotext` is not installed
      # @raise [Errno::ENOENT] when the error is not about a missing `pdftotext`
      # rubocop:disable Metrics/MethodLength
      def pdf_text
        original_file = resource.original_file
        return unless original_file&.pdf?

        # Read the content once; it is both type-checked and written below.
        content = original_file.content
        return unless content.is_a?(String)

        begin
          text = IO.popen(['pdftotext', '-', '-'], 'r+b') do |pdftotext|
            pdftotext.write(content)
            pdftotext.close_write
            pdftotext.read
          end

          # The pipe is binary, so the output is tagged ASCII-8BIT. Re-tag it
          # as UTF-8 and drop only the byte sequences that are not valid
          # UTF-8; transcoding from 'binary' would also discard every valid
          # non-ASCII character.
          text.force_encoding(Encoding::UTF_8)
              .scrub('')
              .tr("\n", ' ')
              .squeeze(' ')
        rescue Errno::ENOENT => e
          raise unless e.message.include?("No such file or directory - pdftotext")
          Rails.logger.warn("`pdftotext' is not installed; unable to extract text from the PDF's content")
          # Return nil explicitly so the logger's return value is never indexed as text.
          nil
        end
      end
      # rubocop:enable Metrics/MethodLength
    end
  end
end
2 changes: 2 additions & 0 deletions config/initializers/hyrax.rb
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@
# essence a "super" method.
original_translator = config.translate_id_to_uri
config.translate_id_to_uri = ->(id) { original_translator.call(id.to_s) }

config.file_set_indexer = Hyku::Indexers::FileSetIndexer
end
# rubocop:enable Metrics/BlockLength

Expand Down
Binary file added spec/fixtures/pdf/pdf_sample.pdf
Binary file not shown.
23 changes: 23 additions & 0 deletions spec/indexers/hyku/indexers/file_set_indexer_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# frozen_string_literal: true

# Covers the Hyrax configuration wiring and PDF text indexing for the
# Hyku file set indexer.
RSpec.describe Hyku::Indexers::FileSetIndexer do
  let(:indexer_class) { described_class }
  let(:resource) { Hyrax.config.file_set_model.constantize.new }
  let(:original_file) { Hyrax::FileMetadata.new }

  it 'is the configured file set indexer' do
    expect(Hyrax.config.file_set_indexer).to eq described_class
  end

  describe '#to_solr' do
    # binread returns the raw PDF bytes and closes the file itself, unlike
    # File.open(...).read, which leaves the handle open.
    let(:stream) { File.binread('spec/fixtures/pdf/pdf_sample.pdf') }

    it 'indexes the text of a pdf that has text already' do
      allow(Flipflop).to receive(:default_pdf_viewer?).and_return(true)
      allow(resource).to receive(:original_file).and_return(original_file)
      allow(original_file).to receive(:pdf?).and_return(true)
      allow(original_file).to receive(:content).and_return(stream)

      expect(resource.to_solr['all_text_tsimv']).to include('Dummy PDF file')
    end
  end
end
Loading