Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions app/helpers/file_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,16 @@ def accept_file(file, name, kind)
msg: msg
}
end

# Sanitize the PDF
sanitized_result = sanitize_pdf(file["tempfile"].path)
unless sanitized_result[:success]
logger.debug "PDF sanitization failed: #{sanitized_result[:msg]}"
return { accepted: false, msg: sanitized_result[:msg] }
end

# Replace the original file with the sanitized version
FileUtils.mv(sanitized_result[:sanitized_path], file["tempfile"].path)
end

logger.debug "Uploaded file is accepted"
Expand Down Expand Up @@ -398,6 +408,53 @@ def validate_pdf(filename)
{ valid: true, encrypted: false }
end

#
# Sanitize a PDF file.
#
# The input is checked with validate_pdf, passed through the qpdf helper,
# re-written by Ghostscript's pdfwrite device into a private scratch
# directory, moved to +output_path+, and validated once more.
#
# @param input_path [String] path of the PDF to sanitize
# @param output_path [String, nil] where the sanitized PDF is written;
#   defaults to "sanitized-<basename of input_path>" in Dir.tmpdir
# @return [Hash] <tt>{ success: true, sanitized_path: String }</tt> on success,
#   otherwise <tt>{ success: false, msg: String }</tt>. Errors raised while
#   sanitizing are logged and reported through +msg+ rather than re-raised.
#
def sanitize_pdf(input_path, output_path = nil)
  # Needed for Shellwords.escape below. Required here because the file's
  # top-level requires are not part of this method; require is idempotent.
  require 'shellwords'

  return { success: false, msg: 'File does not exist' } unless File.exist?(input_path)

  output_path ||= File.join(Dir.tmpdir, "sanitized-#{File.basename(input_path)}")

  logger.debug "Starting PDF sanitization for #{input_path}"

  # Step 1: Validate the PDF
  logger.debug "Validating PDF: #{input_path}"
  return { success: false, msg: 'Invalid or corrupted PDF' } unless validate_pdf(input_path)[:valid]

  # Step 2: Use qpdf to sanitize the PDF
  logger.debug "Running qpdf on: #{input_path}"
  qpdf(input_path)

  # Step 3: Further sanitize using ghostscript.
  # Ghostscript writes into a directory that only this call uses, so a
  # leftover file from an earlier run can never be mistaken for fresh output,
  # and the scratch file is removed even when an error is raised.
  Dir.mktmpdir('gs-sanitized') do |work_dir|
    sanitized_tmp = File.join(work_dir, File.basename(input_path))
    logger.debug "Running ghostscript on: #{input_path}"

    # Paths are shell-escaped: file names may contain spaces, quotes or `$`.
    # -dSAFER is requested explicitly because the input is an untrusted upload.
    exec = [
      'gs -dSAFER -sDEVICE=pdfwrite -dDetectDuplicateImages=true -dPDFSETTINGS=/printer',
      '-dNOPAUSE -dBATCH -dQUIET',
      "-sOutputFile=#{Shellwords.escape(sanitized_tmp)}",
      Shellwords.escape(input_path)
    ].join(' ')
    TimeoutHelper.system_try_within(30, "Ghostscript sanitization timeout", exec)

    # No output means ghostscript failed; never fall back to whatever may
    # already exist at output_path.
    unless File.exist?(sanitized_tmp)
      logger.debug "Ghostscript produced no output for #{input_path}"
      return { success: false, msg: 'Failed to sanitize PDF' }
    end

    FileUtils.mv(sanitized_tmp, output_path)
  end

  # Step 4: Validate the sanitized PDF
  if File.exist?(output_path) && validate_pdf(output_path)[:valid]
    logger.debug "Sanitization complete for #{input_path}"
    { success: true, sanitized_path: output_path }
  else
    { success: false, msg: 'Failed to sanitize PDF' }
  end
rescue StandardError => e
  logger.error "Failed to sanitize PDF #{input_path}. Error: #{e.message}"
  logger.error "Backtrace: #{e.backtrace.join("\n")}"
  { success: false, msg: "Error during sanitization: #{e.message}" }
end

#
# Copy a PDF into place
#
Expand Down Expand Up @@ -636,6 +693,7 @@ def line_wrap(path, width: 160)
module_function :qpdf
module_function :move_files
module_function :validate_pdf
module_function :sanitize_pdf
module_function :copy_pdf
module_function :read_file_to_str
module_function :path_to_plagarism_html
Expand Down
19 changes: 19 additions & 0 deletions test/helpers/file_helper_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
require 'test_helper'

# Tests for FileHelper.sanitize_pdf.
class FileHelperTest < ActiveSupport::TestCase
  test 'sanitize_pdf should sanitize a valid PDF' do
    # Work inside a throw-away directory so that a file left behind by an
    # earlier run cannot satisfy the existence check, and nothing is leaked
    # into the shared tmp dir.
    Dir.mktmpdir('sanitize-pdf-test') do |dir|
      # Sanitize a copy: the pipeline may process its input in place
      # (presumably qpdf does; confirm against FileHelper.qpdf), and the
      # fixture must stay pristine for other tests.
      input_path = File.join(dir, 'valid.pdf')
      FileUtils.cp('test/fixtures/files/valid.pdf', input_path)
      output_path = File.join(dir, 'sanitized-valid.pdf')

      result = FileHelper.sanitize_pdf(input_path, output_path)
      assert result[:success], "Expected sanitization to succeed, but got: #{result[:msg]}"
      assert_equal output_path, result[:sanitized_path]
      assert File.exist?(result[:sanitized_path]), 'Sanitized file does not exist'
    end
  end

  test 'sanitize_pdf should fail for an invalid PDF' do
    input_path = 'test/fixtures/files/invalid.pdf'
    # Without the fixture this test would pass through the "file does not
    # exist" branch and prove nothing about invalid PDFs.
    assert File.exist?(input_path), "Missing fixture: #{input_path}"

    result = FileHelper.sanitize_pdf(input_path)
    assert_not result[:success], 'Expected sanitization to fail for invalid PDF'
  end

  test 'sanitize_pdf should fail when the file does not exist' do
    result = FileHelper.sanitize_pdf('test/fixtures/files/does-not-exist.pdf')
    assert_not result[:success], 'Expected sanitization to fail for a missing file'
    assert_equal 'File does not exist', result[:msg]
  end
end