Skip to content
This repository has been archived by the owner on Oct 24, 2024. It is now read-only.

Commit

Permalink
Merge pull request #689 from scientist-softserv/i681
Browse files Browse the repository at this point in the history
🚧 Add incomplete FileSetsReprocessJob
  • Loading branch information
jeremyf authored Dec 1, 2023
2 parents b80a1e7 + 54d7cf8 commit 8a052bd
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 2 deletions.
130 changes: 130 additions & 0 deletions app/jobs/file_sets_reprocess_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# frozen_string_literal: true

##
# This job is responsible for finding file sets that may need re-processing and then dispatching new
# jobs to perform that processing.
#
# The reasons are twofold, and each is addressed by its own job:
#
# 1. We did not successfully split a PDF; handled by ConditionallyResplitFileSetJob
# 2. We did not successfully attach a PDF; handled by ConditionallyReingestFileSetJob
class FileSetsReprocessJob < ApplicationJob
  ##
  # Enqueue a {FileSetsReprocessJob} for one tenant or for every tenant.
  #
  # NOTE(review): the tenant is switched and not switched back, so the caller is left in the
  # last tenant visited. Also confirm that `Account.switch!` exists as a class-level method;
  # only the instance-level `account.switch!` is exercised in the `:all` branch.
  #
  # @param cname [String, Symbol] when given :all, submit one {FileSetsReprocessJob} per tenant.
  #        Otherwise, switch to the given tenant and submit a {FileSetsReprocessJob}
  def self.for_tenant(cname = :all)
    if cname == :all
      Account.all.each do |account|
        account.switch!
        FileSetsReprocessJob.perform_later
      end
    else
      Account.switch!(cname)
      FileSetsReprocessJob.perform_later
    end
  end

  # Number of Solr documents requested per page in {#perform}.
  class_attribute :solr_page_size, default: 1000
  # Solr query selecting FileSet documents whose mime type is PDF or whose label ends in ".pdf".
  class_attribute :solr_q_parameter,
                  default: "(mime_type_ssi:application/pdf OR label_ssi:*.pdf) AND has_model_ssim:FileSet"
  # Solr fields returned for each document; {#perform} reads only the id and mime type.
  class_attribute :solr_fl_parameter, default: 'id,label_ssi,mime_type_ssi'
  # A document with this mime type is treated as having its file successfully attached.
  class_attribute :desired_mime_type, default: "application/pdf"

  ##
  # Page through every Solr document matching {.solr_q_parameter} and enqueue one follow-up job
  # per document: {ConditionallyResplitFileSetJob} when the document reports the desired mime
  # type, {ConditionallyReingestFileSetJob} otherwise.
  def perform
    count = ActiveFedora::SolrService.count(solr_q_parameter)

    # Ceiling division: exactly as many pages as are needed to cover `count` documents (zero
    # pages when nothing matches), instead of always querying past the end of the results.
    page_count = (count + solr_page_size - 1) / solr_page_size

    page_count.times do |page|
      ActiveFedora::SolrService.query(solr_q_parameter,
                                      fl: solr_fl_parameter,
                                      rows: solr_page_size,
                                      start: page * solr_page_size).each do |document|
        if document[:mime_type_ssi] == desired_mime_type
          # Given that we have a mime_type we can assume that we've successfully attached the file.
          ConditionallyResplitFileSetJob.perform_later(file_set_id: document[:id])
        else
          # We have failed to attach the file to the work.
          ConditionallyReingestFileSetJob.perform_later(file_set_id: document[:id])
        end
      end
    end
  end

  ##
  # A helper module for conditionally finding a file set.
  #
  # @see .find
  module FileSetFinder
    ##
    # Look up a FileSet, logging a warning (rather than raising) when it no longer exists.
    #
    # @param file_set_id [String]
    # @return [FileSet] when the given :file_set_id is found.
    # @return [FalseClass] when the given :file_set_id is not found.
    def self.find(file_set_id:)
      FileSet.find(file_set_id)
    rescue ActiveFedora::ObjectNotFoundError
      # `name` is this module's name; `self.class` would interpolate as "Module" here.
      message = "#{name}.#{__method__} unable to find FileSet with ID=#{file_set_id}. " \
                "It may have been deleted between the enqueuing of this job and running this job."
      # Ruby's Logger exposes #warn; there is no #warning.
      Rails.logger.warn(message)
      false
    end
  end

  ##
  # This job conditionally re-splits a file_set's PDF.  How do we know if we need to re-split
  # it?  See the {#perform} method for details.
  #
  # 1. The file_set is a PDF.
  # 2. The file_set's PDF is one that we would normally split.
  # 3. The file_set's parent does not have child works; the assumption being that if it doesn't
  #    have child works, then the PDF has not yet been successfully split.
  class ConditionallyResplitFileSetJob < ApplicationJob
    ##
    # @param file_set_id [String]
    #
    # @return [Symbol] A terse explanation of what was done with this job.
    #
    # @raise [ActiveFedora::ObjectNotFoundError] when the given FileSet's parent could not be found.
    # rubocop:disable Metrics/LineLength
    def perform(file_set_id:)
      file_set = FileSetFinder.find(file_set_id: file_set_id)

      # We've logged this (see FileSetFinder.find) so we'll move along.
      return :file_set_not_found unless file_set

      # When we aren't working with a PDF, let's not proceed.
      return :not_a_pdf unless file_set.pdf?

      # When the PDF we are working with isn't something we split, let's bail.
      return :non_splitting_pdf unless IiifPrint::SplitPdfs::AdventistPagesToJpgsSplitter.split_this?(path: file_set.label)

      parent = IiifPrint.parent_for(file_set)

      raise ActiveFedora::ObjectNotFoundError, "Expected #{file_set.class} ID=#{file_set.id} to have a parent record." unless parent

      # The parent must be configured with a PDF splitter service for splitting to make sense.
      return :parent_does_not_split unless parent.try(:iiif_print_config).try(:pdf_splitter_service)

      # When the parent has children, assume that we've already previously succeeded on splitting
      # this PDF.
      return :has_children if parent.child_work_ids.any?

      IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set: file_set, user: User.batch_user)
      :requesting_split
    end
    # rubocop:enable Metrics/LineLength
  end

  ##
  # This job is enqueued for file sets that do not report the desired mime type, which is taken
  # to mean the file was not properly attached.  The re-ingest itself is not yet implemented; at
  # present the job only confirms that the file set still exists.
  class ConditionallyReingestFileSetJob < ApplicationJob
    ##
    # @param file_set_id [String]
    # @return [Symbol] A terse explanation of what was done with this job.
    def perform(file_set_id:)
      file_set = FileSetFinder.find(file_set_id: file_set_id)

      # We've logged this (see FileSetFinder.find) so we'll move along.
      return :file_set_not_found unless file_set

      # TODO: The file set does not appear to have a properly attached file.
      :reingest_not_implemented
    end
  end
end
15 changes: 13 additions & 2 deletions lib/iiif_print/split_pdfs/adventist_pages_to_jpgs_splitter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,24 @@
module IiifPrint
module SplitPdfs
module AdventistPagesToJpgsSplitter
##
# Decide whether the file at the given path should be split, based solely on whether the
# (downcased) path ends with one of the given suffixes.
#
# @param path [String] the path, in particular filename (that hopefully ends with an
#        extension).
#
# @param suffixes [Array<String>] the list of suffixes that we want to ignore for splitting.
#        Each is compared against the downcased path.
# @return [TrueClass] when we should be splitting this path.
# @return [FalseClass] when we should not be splitting this path.
def self.split_this?(path:, suffixes: CreateDerivativesJobDecorator::NON_ARCHIVAL_PDF_SUFFIXES)
  # Downcase once, rather than once per suffix.
  downcased_path = path.downcase
  suffixes.none? { |suffix| downcased_path.end_with?(suffix) }
end

##
# We do not always want to split a PDF; this provides a decision point.
#
# @param path [String] the path of the file we're attempting to run derivatives against.
# @param args [Array<Object>] pass through args
# @param splitter [IiifPrint::SplitPdfs::BaseSplitter] (for dependency injection)
# @param suffix [String] (for dependency injection)
# @param suffixes [Array<String>] (for dependency injection)
#
# @return [Enumerable] when we are going to skip splitting, return an empty array; otherwise return
# an instance of {IiifPrint::SplitPdfs::AdventistPagesToJpgsSplitter}.
Expand All @@ -20,7 +31,7 @@ def self.call(path,
splitter: DerivativeRodeoSplitter,
suffixes: CreateDerivativesJobDecorator::NON_ARCHIVAL_PDF_SUFFIXES,
**args)
return [] if suffixes.any? { |suffix| path.downcase.end_with?(suffix) }
return [] unless AdventistPagesToJpgsSplitter.split_this?(path: path, suffixes: suffixes)

splitter.call(path, **args)
end
Expand Down
14 changes: 14 additions & 0 deletions spec/factories/file_sets.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,23 @@
# Base FileSet factory. The transient attributes are inputs to the callbacks below and are
# not assigned to the FileSet itself.
factory :file_set do
transient do
# The depositing user; created on demand unless the caller passes one.
user { FactoryBot.create(:user) }
# Optional content to upload into the file set; nil means no upload (see :file_with_work).
content { nil }
end

# Record the transient user as the depositor on every built file set.
after(:build) do |fs, evaluator|
fs.apply_depositor_metadata evaluator.user
end

# A titled file set that, once created, is added as a member of a newly created generic work.
factory :file_with_work do
after(:build) do |file, _evaluator|
file.title = ['testfile']
end
after(:create) do |file, evaluator|
# Upload happens only when content was supplied, and before the work is created.
Hydra::Works::UploadFileToFileSet.call(file, evaluator.content) if evaluator.content
work = create(:generic_work, user: evaluator.user)
work.members << file
work.save!
end
end
end
end
Binary file added spec/fixtures/latex.pdf
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,23 @@
require 'spec_helper'

RSpec.describe IiifPrint::SplitPdfs::AdventistPagesToJpgsSplitter do
describe '.split_this?' do
  subject { described_class.split_this?(path: path) }

  # Each entry maps a candidate path to the expected decision when the splitter is called
  # with its default suffixes; one context is generated per entry.
  {
    "hello.jpg" => true,
    "hello.reader.pdf" => false,
    "hello.reader.jpg" => true,
    "hello.reader.pdf.pdf" => true
  }.each_pair do |given_path, expected_value|
    context "given #{given_path.inspect}" do
      let(:path) { given_path }

      it { is_expected.to eq(expected_value) }
    end
  end
end

describe '.call' do
subject { described_class.call(path, suffixes: ["spec.rb"], file_set: create(:file_set)) }

Expand Down
32 changes: 32 additions & 0 deletions spec/jobs/file_sets_reprocess_job_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# frozen_string_literal: true

require 'spec_helper'

# Exercises FileSetsReprocessJob and its nested ConditionallyResplitFileSetJob against a file
# set created from the latex.pdf fixture via the :file_with_work factory.
RSpec.describe FileSetsReprocessJob, clean: true do
let(:user) { FactoryBot.create(:user) }
# Lazily created: nothing is persisted until an example first references `file_set`.
let(:file_set) { FactoryBot.create(:file_with_work, content: file_content, user: user, label: 'latex.pdf') }
let(:file_content) { File.open(fixture_path + '/latex.pdf') }

describe '#perform' do
it 'submits jobs' do
# Referencing `file_set.id` in the expectation already evaluates the lazy `let`, so the
# file set exists before the job runs; the bare `file_set` below makes that explicit.
expect(described_class::ConditionallyResplitFileSetJob).to receive(:perform_later).with(file_set_id: file_set.id)
file_set

described_class.perform_now
# Verifying that we found one record to consider resplitting.
end
end

describe 'ConditionallyResplitFileSetJob#perform' do
describe '#perform' do
it 'submits IiifPrint::Jobs::RequestSplitPdfJob' do
# Create the file set before the expectation is installed.
# NOTE(review): presumably so anything enqueued during creation is not counted against
# the expectation below; confirm.
file_set

expect(IiifPrint::Jobs::RequestSplitPdfJob)
.to receive(:perform_later)
.with(file_set: file_set, user: User.batch_user)
FileSetsReprocessJob::ConditionallyResplitFileSetJob.perform_now(file_set_id: file_set.id)
end
end
end
end

0 comments on commit 8a052bd

Please sign in to comment.