Skip to content

Commit

Permalink
🎁 Add logic for snippets when splitting PDFs
Browse files Browse the repository at this point in the history
This commit adds the logic needed to show search snippets
for PDFs that were split through IIIF Print.
  • Loading branch information
kirkkwang committed Sep 18, 2024
1 parent 54bf5d7 commit b666d84
Showing 1 changed file with 30 additions and 3 deletions.
33 changes: 30 additions & 3 deletions app/indexers/concerns/hyku_indexing.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ module HykuIndexing
solr_doc['valkyrie_bsi'] = object.kind_of?(Valkyrie::Resource)
solr_doc['member_ids_ssim'] = object.member_ids.map(&:id) if object.kind_of?(Valkyrie::Resource)
# TODO: Reinstate once valkyrie fileset work is complete - https://github.com/scientist-softserv/hykuup_knapsack/issues/34
solr_doc['all_text_tsimv'] = full_text(Hyrax.custom_queries.find_child_file_sets(resource: resource).first.id.to_s)
solr_doc['all_text_tsimv'] = full_text(object)
# rubocop:enable Style/ClassCheck
solr_doc['title_ssim'] = SortTitle.new(object.title.first).alphabetical
solr_doc['depositor_ssi'] = object.depositor
Expand All @@ -38,12 +38,39 @@ module HykuIndexing

private

def full_text(file_set_id)
return if !Flipflop.default_pdf_viewer? || file_set_id.blank?
# Builds the full-text value for +object+ (indexed as +all_text_tsimv+ by the caller).
#
# If +object+ has child works (e.g. a PDF that was split into per-page works),
# the text of every child work's file sets is collected via +all_text+, blank
# entries are dropped, and the remainder is joined with a dashed separator line.
# If there are no child works, or none of their file sets yielded any text, the
# text is taken from the object's own first file set instead.
#
# @param object the resource being indexed
# @return [String, nil] the joined child text, or whatever
#   +extract_text_from_pdf_directly+ returns for the fallback path
def full_text(object)
  child_works = Hyrax.custom_queries.find_child_works(resource: object)
  return extract_text_from_pdf_directly(object) if child_works.empty?

  file_set_texts = child_works_file_sets(child_works).map { |fs| all_text(fs) }.select(&:present?)
  # Blank entries are already filtered out above, so an empty list means no
  # child file set produced text; no need to join everything just to check.
  return extract_text_from_pdf_directly(object) if file_set_texts.empty?

  file_set_texts.join("\n---------------------------\n")
end

# Reads the already-indexed full text of +object+'s first child file set.
#
# Looks up the child file sets of +object+, takes the first one, and returns the
# +all_text_tsimv+ value from that file set's Solr document.
#
# NOTE(review): this presumes the file set has a Solr document by the time the
# parent is indexed; confirm how SolrDocument.find behaves for a missing id.
#
# @param object the resource whose first file set should be consulted
# @return the +all_text_tsimv+ value, or nil when there is no file set / no id
def extract_text_from_pdf_directly(object)
  file_sets = Hyrax.custom_queries.find_child_file_sets(resource: object)
  solr_id = file_sets.first&.id&.to_s
  return if solr_id.blank?

  solr_document = SolrDocument.find(solr_id)
  solr_document['all_text_tsimv']
end

# Collects the file sets of every work in +child_works+ into one flat array.
#
# @param child_works an enumerable of works to query
# @return [Array] all child file sets, in child-work order
def child_works_file_sets(child_works)
  file_sets_per_work = child_works.map do |work|
    Hyrax.custom_queries.find_child_file_sets(resource: work)
  end
  file_sets_per_work.flatten
end

# Returns the 'txt' derivative data for a file set as a single line of text.
#
# The data is fetched through IiifPrint::Data::WorkDerivatives; newlines are
# turned into spaces and runs of spaces are collapsed to one. When no data is
# available an empty string is returned.
#
# @param fs the file set whose text derivative is wanted
# @return [String]
def all_text(fs)
  raw_text = IiifPrint::Data::WorkDerivatives.data(from: fs, of_type: 'txt')
  raw_text ||= ''

  raw_text.empty? ? raw_text : raw_text.tr("\n", ' ').squeeze(' ')
end

def add_date(solr_doc)
date_string = solr_doc['date_created_tesim']&.first
return unless date_string
Expand Down

0 comments on commit b666d84

Please sign in to comment.