diff --git a/config/app.yml b/config/app.yml
index 0471152..8ad03fc 100644
--- a/config/app.yml
+++ b/config/app.yml
@@ -29,4 +29,7 @@ libreoffice_path: <%= ENV['LIBREOFFICE_PATH'] %>
 tesseract_path: <%= ENV['TESSERACT_PATH'] %>
 tika_path: <%= ENV['TIKA_PATH'] %>
 wkhtmltopdf_path: <%= ENV['WKHTMLTOPDF_PATH'] %>
+
+# Other settings
+tika_config_directory: <%= ENV['TIKA_CONFIG_DIRECTORY'] %>
 wkhtmltopdf_params: '-d 100 --encoding UTF-8'
diff --git a/lib/colore.rb b/lib/colore.rb
index 9b50db9..ae1b295 100644
--- a/lib/colore.rb
+++ b/lib/colore.rb
@@ -9,3 +9,4 @@
 require_relative 'document'
 require_relative 'heathen'
 require_relative 'sidekiq_workers'
+require_relative 'tika_config'
diff --git a/lib/config.rb b/lib/config.rb
index c034cca..e94d90b 100644
--- a/lib/config.rb
+++ b/lib/config.rb
@@ -41,6 +41,8 @@ class C_
     attr_accessor :tika_path
     # @return [String] Path to the wkhtmltopdf binary. Defaults to `"wkhtmltopdf"`
     attr_accessor :wkhtmltopdf_path
+    # @return [String] Relative path to the writable tika config directory. Defaults to `"../tmp/tika"`
+    attr_accessor :tika_config_directory
     # @return [String] Params for wkhtmltopdf
     attr_accessor :wkhtmltopdf_params
 
@@ -65,6 +67,8 @@ def self.config
         c.tesseract_path = yaml['tesseract_path'] || 'tesseract'
         c.tika_path = yaml['tika_path'] || 'tika'
         c.wkhtmltopdf_path = yaml['wkhtmltopdf_path'] || 'wkhtmltopdf'
+
+        c.tika_config_directory = yaml['tika_config_directory'] || '../tmp/tika'
         c.wkhtmltopdf_params = yaml['wkhtmltopdf_params'] || ''
 
         c
diff --git a/lib/heathen/processor_methods/libreoffice.rb b/lib/heathen/processor_methods/libreoffice.rb
index f76f997..a983f59 100644
--- a/lib/heathen/processor_methods/libreoffice.rb
+++ b/lib/heathen/processor_methods/libreoffice.rb
@@ -55,6 +55,7 @@ def libreoffice(format:)
       if to_suffix == 'txt'
         executioner.execute(
           Colore::C_.tika_path,
+          "--config=#{Colore::TikaConfig.path_for(job.language)}",
           '--text',
           job.content_file,
           binary: true
diff --git a/lib/heathen/processor_methods/pdftotext.rb b/lib/heathen/processor_methods/pdftotext.rb
index d1dca7b..54a7c8c 100644
--- a/lib/heathen/processor_methods/pdftotext.rb
+++ b/lib/heathen/processor_methods/pdftotext.rb
@@ -7,6 +7,7 @@ def pdftotext
 
       executioner.execute(
         Colore::C_.tika_path,
+        "--config=#{Colore::TikaConfig.path_for(job.language)}",
         '--text',
         job.content_file,
         binary: true
diff --git a/lib/tika_config.rb b/lib/tika_config.rb
new file mode 100644
index 0000000..f6d2156
--- /dev/null
+++ b/lib/tika_config.rb
@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+
+require 'fileutils'
+require 'pathname'
+
+module Colore
+  # The Colore Tika is a module to help with Tika-related configuration files.
+  module TikaConfig
+    # The configuration template version
+    VERSION = 'v1'
+
+    # The default language to use when the language has not been found
+    DEFAULT_LANGUAGE = 'eng'
+
+    # Config template
+    TEMPLATE = <<~XML
+      <?xml version="1.0" encoding="UTF-8"?>
+      <properties>
+        <parsers>
+          <parser class="org.apache.tika.parser.DefaultParser"></parser>
+          <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+            <params>
+              <param name="language" type="string">%<language_alpha3>s</param>
+            </params>
+          </parser>
+        </parsers>
+      </properties>
+    XML
+
+    class << self
+      private
+
+      def tika_config_path
+        Pathname.new File.expand_path(Colore::C_.tika_config_directory, __dir__)
+      end
+
+      def path_for!(language_alpha3)
+        file = tika_config_path.join('ocr', VERSION, "tika.#{language_alpha3}.xml")
+        return file if file.file?
+
+        FileUtils.mkdir_p(tika_config_path.join('ocr', VERSION))
+        File.write(file, format(TEMPLATE, language_alpha3: language_alpha3))
+        file
+      end
+    end
+
+    # Returns the file path of the Tika configuration for performing OCR
+    # detection in a specified language.
+    #
+    # @param [String] language The language code in either ISO 639-1 (two-letter) or ISO 639-2 (three-letter) format.
+    #   Supported languages are those with corresponding Tika configuration files.
+    #
+    # @return [Pathname] The path to the Tika configuration file for the specified language or
+    #   the configuration file for DEFAULT_LANGUAGE if the language is not found.
+    def self.path_for(language)
+      language_alpha3 = Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE
+
+      path_for!(language_alpha3)
+    end
+  end
+end
diff --git a/spec/fixtures/heathen/quickfox.ar.jpg b/spec/fixtures/heathen/quickfox.ar.jpg
new file mode 100644
index 0000000..bb69b24
Binary files /dev/null and b/spec/fixtures/heathen/quickfox.ar.jpg differ
diff --git a/spec/fixtures/heathen/quickfox.ar.pdf b/spec/fixtures/heathen/quickfox.ar.pdf
new file mode 100644
index 0000000..9c613b0
Binary files /dev/null and b/spec/fixtures/heathen/quickfox.ar.pdf differ
diff --git a/spec/fixtures/heathen/quickfox.ar.txt b/spec/fixtures/heathen/quickfox.ar.txt
new file mode 100644
index 0000000..36b2c79
--- /dev/null
+++ b/spec/fixtures/heathen/quickfox.ar.txt
@@ -0,0 +1 @@
+الثعلب البني السريع مفتون بالكلاب الكسولة
diff --git a/spec/heathen/processor_methods/pdftotext_spec.rb b/spec/heathen/processor_methods/pdftotext_spec.rb
index a48484d..8895141 100644
--- a/spec/heathen/processor_methods/pdftotext_spec.rb
+++ b/spec/heathen/processor_methods/pdftotext_spec.rb
@@ -4,7 +4,8 @@
 
 RSpec.describe Heathen::Processor do
   let(:content) { fixture('heathen/quickfox.pdf').read }
-  let(:job) { Heathen::Job.new 'foo', content, 'en' }
+  let(:job) { Heathen::Job.new 'foo', content, language }
+  let(:language) { 'en' }
   let(:processor) { described_class.new job: job, logger: spec_logger }
 
   after do
@@ -14,7 +15,19 @@
   describe '#pdftotext' do
     it 'converts PDF to TXT' do
       processor.pdftotext
+      expect(job.content).to eq 'The quick brown fox jumps lazily over the dog'
       expect(job.content.mime_type).to eq 'text/plain; charset=us-ascii'
     end
+
+    context 'with Arabic files' do
+      let(:content) { fixture('heathen/quickfox.ar.pdf').read }
+      let(:language) { 'ar' }
+
+      it 'extracts Arabic text from images' do
+        processor.pdftotext
+        expect(job.content).to eq fixture('heathen/quickfox.ar.txt').read.strip.force_encoding(Encoding::ASCII_8BIT)
+        expect(job.content.mime_type).to eq 'text/plain; charset=utf-8'
+      end
+    end
   end
 end
diff --git a/spec/lib/tika_config_spec.rb b/spec/lib/tika_config_spec.rb
new file mode 100644
index 0000000..e2bf234
--- /dev/null
+++ b/spec/lib/tika_config_spec.rb
@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require 'fileutils'
+require 'pathname'
+
+RSpec.describe Colore::TikaConfig do
+  let(:tika_config_directory) { '../tmp/tika-test' }
+  let(:tika_test_config_path) { Pathname.new(File.expand_path('../../tmp/tika-test', __dir__)) }
+
+  before do
+    allow(Colore::C_.config).to receive(:tika_config_directory).and_return tika_config_directory
+    FileUtils.mkdir_p tika_test_config_path
+    FileUtils.rm_rf tika_test_config_path
+  end
+
+  after do
+    FileUtils.rm_rf tika_test_config_path
+  end
+
+  describe '.path_for' do
+    subject(:path_for) { described_class.path_for(language) }
+
+    context 'when the language is found' do
+      let(:language) { 'fr' }
+
+      before do
+        allow(Colore::Utils).to receive(:language_alpha3).with('fr').and_return('fra')
+      end
+
+      it 'returns the correct configuration file path' do
+        expect(path_for).to eq tika_test_config_path.join('ocr', described_class::VERSION, 'tika.fra.xml')
+      end
+    end
+
+    context 'when the language is not found' do
+      let(:language) { 'unknown' }
+
+      it 'returns the default configuration file path' do
+        expect(path_for).to eq tika_test_config_path.join('ocr', described_class::VERSION, "tika.#{described_class::DEFAULT_LANGUAGE}.xml")
+      end
+    end
+
+    context 'when the configuration file is already present' do
+      let(:language) { 'en' }
+
+      before do
+        allow(File).to receive(:write)
+          .with(tika_test_config_path.join('ocr', described_class::VERSION, 'tika.eng.xml'), an_instance_of(String))
+          .and_call_original
+      end
+
+      it 'does not overwrite it' do
+        2.times { described_class.path_for(language) }
+        expect(File).to have_received(:write).once
+      end
+    end
+  end
+end