-
Notifications
You must be signed in to change notification settings - Fork 1
/
iiif_print.rb
324 lines (295 loc) · 13.3 KB
/
iiif_print.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
require "iiif_print/engine"
require "iiif_print/errors"
require "iiif_print/jp2_image_metadata"
require "iiif_print/image_tool"
require "iiif_print/text_extraction"
require "iiif_print/data"
require "iiif_print/configuration"
require "iiif_print/base_derivative_service"
require "iiif_print/jp2_derivative_service"
require "iiif_print/pdf_derivative_service"
require "iiif_print/text_extraction_derivative_service"
require "iiif_print/text_formats_from_alto_service"
require "iiif_print/tiff_derivative_service"
require "iiif_print/lineage_service"
require "iiif_print/metadata"
require "iiif_print/split_pdfs/base_splitter"
require "iiif_print/split_pdfs/child_work_creation_from_pdf_service"
require "iiif_print/split_pdfs/derivative_rodeo_splitter"
require "iiif_print/split_pdfs/destroy_pdf_child_works_service"
require "iiif_print/persistence_layer"
require "iiif_print/persistence_layer/active_fedora_adapter"
require "iiif_print/persistence_layer/valkyrie_adapter"
# rubocop:disable Metrics/ModuleLength
module IiifPrint
extend ActiveSupport::Autoload
autoload :Configuration
autoload :CatalogSearchBuilder
##
# @api public
#
# Exposes the IiifPrint configuration.
#
# @yieldparam [IiifPrint::Configuration] config if a block is passed
# @return [IiifPrint::Configuration]
# @see IiifPrint::Configuration for configuration options
def self.config(&block)
@config ||= IiifPrint::Configuration.new
yield @config if block
@config
end
class << self
delegate(
:persistence_adapter,
:skip_splitting_pdf_files_that_end_with_these_texts,
to: :config
)
delegate(
:clean_for_tests!,
:copy_derivatives_from_data_store,
:create_relationship_between,
:destroy_children_split_from,
:extract_text_for,
:find_by,
:find_by_title_for,
:grandparent_for,
:index_works,
:object_in_works,
:object_ordered_works,
:parent_for,
:pdf?,
:save,
:solr_construct_query,
:solr_name,
:solr_query,
to: :persistence_adapter
)
end
# NOTE: We use lambdas so we can have default values but also provide a lazy configuration.
# There are certainly better ways but this is the least intrusive refactor from prior state.
DEFAULT_MODEL_CONFIGURATION = {
# Split a PDF into individual page images and create a new child work for each image.
pdf_splitter_job: -> { IiifPrint::Jobs::ChildWorksFromPdfJob },
pdf_splitter_service: -> { IiifPrint::SplitPdfs::PagesToJpgsSplitter },
derivative_service_plugins: lambda {
[
IiifPrint::TextExtractionDerivativeService
]
}
}.freeze
# This is the record level configuration for PDF split handling.
ModelConfig = Struct.new(:pdf_split_child_model, *DEFAULT_MODEL_CONFIGURATION.keys, keyword_init: true)
private_constant :ModelConfig
##
# @api public
# This method is responsible for configuring a model for additional derivative generation.
#
# @example
# class Book < ActiveFedora::Base
# include IiifPrint.model_configuration(
# pdf_split_child_model: Page,
# derivative_service_plugins: [
# IiifPrint::JP2DerivativeService,
# IiifPrint::PDFDerivativeService,
# IiifPrint::TextExtractionDerivativeService,
# IiifPrint::TIFFDerivativeService
# ]
# )
# end
#
# @param kwargs [Hash<Symbol,Object>] the configuration values that overrides the
# DEFAULT_MODEL_CONFIGURATION.
# @option kwargs [Array<Class>] derivative_service_plugins the various derivatives to run on the
# "original" files associated with this work. Options include:
# {IiifPrint::JP2DerivativeService}, {IiifPrint::PDFDerivativeService},
# {IiifPrint::TextExtractionDerivativeService}, {IiifPrint::TIFFDerivativeService}
# @option kwargs [Class] pdf_splitter_job responsible for handling the splitting of the original file
# @option kwargs [Class] pdf_split_child_model when we split the file into pages, what's the child model
# we want for those pages? Often times this is likely the same model as the parent.
# @option kwargs [Class] pdf_splitter_service the specific service that splits the PDF. Options are:
# {IiifPrint::SplitPdfs::PagesToJpgsSplitter},
# {IiifPrint::SplitPdfs::PagesToTiffsSplitter},
# {IiifPrint::SplitPdfs::PagesToPngsSplitter},
# {IiifPrint::SplitPdfs::DerivativeRodeoSplitter}
#
# @return [Module]
#
# @see IiifPrint::DEFAULT_MODEL_CONFIGURATION
# @todo Because not every job will split PDFs and write to a child model. May want to introduce
# an alternative splitting method to create new filesets on the existing work instead of new child works.
# rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
def self.model_configuration(**kwargs)
Module.new do
extend ActiveSupport::Concern
included do
work_type = self # In this case self is the class we're mixing the new module into.
# Ensure that the work_type and corresponding indexer are properly decorated for IiifPrint
indexer = if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_with_adapter_logic(work_type: work_type)
elsif work_type < ActiveFedora::Base
IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_with_adapter_logic(work_type: work_type)
else
raise "Unable to mix '.model_configuration' into #{work_type}"
end
# Ensure that the work_type and corresponding indexer are properly decorated for IiifPrint
if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_form_with_adapter_logic(work_type: work_type)
elsif work_type < ActiveFedora::Base
IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_form_with_adapter_logic(work_type: work_type)
else
raise "Unable to mix '.model_configuration' into #{work_type}"
end
# Deriving lineage of objects is a potentially complicated thing. We provide a default
# service but each work_type's indexer can be configured by amending it's
# {.iiif_print_lineage_service}.
indexer.class_attribute(:iiif_print_lineage_service, default: IiifPrint::LineageService) unless indexer.respond_to?(:iiif_print_lineage_service)
work_type::GeneratedResourceSchema.send(:include, IiifPrint::SetChildFlag) if work_type.const_defined?(:GeneratedResourceSchema)
end
# We don't know what you may want in your configuration, but from this gems implementation,
# we're going to provide the defaults to ensure that it works.
DEFAULT_MODEL_CONFIGURATION.each_pair do |key, default_value|
kwargs[key] ||= default_value.call
end
define_method(:iiif_print_config) do
@iiif_print_config ||= ModelConfig.new(**kwargs)
end
def iiif_print_config?
true
end
end
end
# rubocop:enable Metrics/MethodLength
# @api public
#
# Map the given work's metadata to the given IIIF version spec's metadata structure. This
# is intended to be a drop-in replacement for `Hyrax::IiifManifestPresenter#manifest_metadata`.
#
# @param work [Object]
# @param version [Integer]
# @param fields [Array<IiifPrint::Metadata::Field>, Array<#name, #label>]
# @return [Array<Hash>]
#
# @see specs for expected output
#
# @see Hyrax::IiifManifestPresenter#manifest_metadata
def self.manifest_metadata_for(work:,
version: config.default_iiif_manifest_version,
fields: defined?(AllinsonFlex) ? fields_for_allinson_flex : default_fields,
current_ability:,
base_url:)
Metadata.build_metadata_for(work: work,
version: version,
fields: fields,
current_ability: current_ability,
base_url: base_url)
end
def self.manifest_metadata_from(work:, presenter:)
current_ability = presenter.try(:ability) || presenter.try(:current_ability)
base_url = presenter.try(:base_url) || presenter.try(:request)&.base_url
IiifPrint.manifest_metadata_for(work: work, current_ability: current_ability, base_url: base_url)
end
# Hash is an arbitrary attribute key/value pairs
# Struct is a defined set of attribute "keys". When we favor defined values,
# then we are naming the concept and defining the range of potential values.
Field = Struct.new(:name, :label, :options, keyword_init: true)
# @api private
# @todo Figure out a way to use a custom label, right now it takes it get rendered from the title.
def self.default_fields(fields: config.metadata_fields)
fields.map do |field|
Field.new(
name: field.first,
label: Hyrax::Renderers::AttributeRenderer.new(field.first, nil).label,
options: field.last
)
end
end
##
# @param fields [Array<IiifPrint::Field>]
def self.fields_for_allinson_flex(fields: allinson_flex_fields, sort_order: IiifPrint.config.iiif_metadata_field_presentation_order)
fields = sort_af_fields!(fields, sort_order: sort_order)
fields.each_with_object({}) do |field, hash|
# filters out admin_only fields
next if field.indexing&.include?('admin_only')
# WARNING: This is assuming A LOT
# This is taking the Allinson Flex fields that have the same name and only
# using the first one while discarding the rest. There currently no way to
# controller which one(s) are discarded but this fits for the moment.
next if hash.key?(field.name)
# currently only supports the faceted option
# Why the `render_as:`? This was originally derived from Hyku default attributes
# @see https://github.com/samvera/hyku/blob/c702844de4c003eaa88eb5a7514c7a1eae1b289e/app/views/hyrax/base/_attribute_rows.html.erb#L3
hash[field.name] = Field.new(
name: field.name,
label: field.value,
options: field.indexing&.include?('facetable') ? { render_as: :faceted } : nil
)
end.values
end
CollectionFieldShim = Struct.new(:name, :value, :indexing, keyword_init: true)
##
# @return [Array<IiifPrint::Field>]
def self.allinson_flex_fields
return @allinson_flex_fields if defined?(@allinson_flex_fields)
allinson_flex_relation = AllinsonFlex::ProfileProperty
.joins(:texts)
.where(allinson_flex_profile_texts: { name: 'display_label' })
.distinct
.select(:name, :value, :indexing)
flex_fields = allinson_flex_relation.to_a
unless allinson_flex_relation.exists?(name: 'collection')
collection_field = CollectionFieldShim.new(name: :collection, value: 'Collection', indexing: [])
flex_fields << collection_field
end
@allinson_flex_fields = flex_fields
end
##
# @param fields [Array<IiifPrint::Field>]
# @param sort_order [Array<Symbol>]
def self.sort_af_fields!(fields, sort_order:)
return fields if sort_order.blank?
fields.sort_by do |field|
sort_order_index = sort_order.index(field.name.to_sym)
sort_order_index.nil? ? sort_order.length : sort_order_index
end
end
##
# @api public
#
# @param work [ActiveFedora::Base]
# @param file_set [FileSet]
# @param locations [Array<String>]
# @param user [User]
#
# @return [Symbol] when none of the locations are to be split.
def self.conditionally_submit_split_for(work:, file_set:, locations:, user:, skip_these_endings: skip_splitting_pdf_files_that_end_with_these_texts)
locations = locations.select { |location| split_for_path_suffix?(location, skip_these_endings: skip_these_endings) }
return :no_pdfs_for_splitting if locations.empty?
# Hyrax::FileSet ids are Valkyrie::ID's which can't be passed, so we call id on that and get the string id
file_set_id = file_set.id.try(:id) || file_set.id
work_admin_set_id = work.admin_set_id.try(:id) || work.admin_set_id
work.try(:iiif_print_config)&.pdf_splitter_job&.perform_later(
file_set_id,
locations,
user,
work_admin_set_id,
0 # A no longer used parameter; but we need to preserve the method signature (for now)
)
end
##
# @api public
#
# @param path [String] the path, hopefully with an extension, to the file we're considering
# splitting.
# @param skip_these_endings [Array<#downcase>] the endings that we should skip for splitting
# purposes.
# @return [TrueClass] when the path is one we should split
# @return [FalseClass] when the path is one we should not split
#
# @see .skip_splitting_pdf_files_that_end_with_these_texts
def self.split_for_path_suffix?(path, skip_these_endings: skip_splitting_pdf_files_that_end_with_these_texts)
return false unless path.downcase.end_with?('.pdf')
return true if skip_these_endings.empty?
!path.downcase.end_with?(*skip_these_endings.map(&:downcase))
end
end
# rubocop:enable Metrics/ModuleLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity