Skip to content

Commit

Permalink
Add Language Detection endpoint
Browse files Browse the repository at this point in the history
Close #247

Test
  • Loading branch information
tagliala committed Sep 17, 2024
1 parent 6a5e963 commit d48a68a
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 0 deletions.
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,28 @@ Response:

... PDF document body ...

### Detect language

This is a foreground document language detection request. The detected language
will be returned as the response body.

POST /detect-language

Params *(suggest using `multipart/form-data`)*:

* `file` - the file whose language should be detected

#### Example:

POST /detect-language
file=... foo.docx ...

Response:

Content-Type: text/plain

en

## Callbacks

When a document conversion is completed, an attempt will be made to POST a
Expand Down
14 changes: 14 additions & 0 deletions config/tika.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Tika configuration that overrides the default OCR parser settings.
  The Tesseract OCR parser is excluded from the default parser and then
  declared again on its own so that custom parameters can be applied to it.
-->
<properties>
<parsers>
<!-- https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=109454096#TikaOCR-OverridingDefaultConfiguration -->
<!-- Default parser with the Tesseract OCR parser excluded. -->
<parser class="org.apache.tika.parser.DefaultParser">
<parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
</parser>
<!-- Tesseract OCR parser declared explicitly with a custom "language" parameter. -->
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
<params>
<!-- NOTE(review): presumably Tesseract language codes joined with "+" (English, Arabic, Spanish, French, Portuguese) - confirm the matching language data is installed. -->
<param name="language" type="string">eng+ara+spa+fra+por</param>
</params>
</parser>
</parsers>
</properties>
24 changes: 24 additions & 0 deletions lib/app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,30 @@ class App < Sinatra::Base
respond_with_error e
end

#
# Detect document language
#
# POST params:
# file - the file whose language should be detected
post '/detect-language' do
  begin
    upload = params[:file]

    # A request without any `file` parameter is rejected outright.
    return respond 400, "missing file parameter" unless upload

    # The parameter must answer to `fetch` and carry a readable :tempfile;
    # anything else is treated as an invalid upload.
    unless upload.respond_to?(:fetch) && upload.fetch(:tempfile, nil).respond_to?(:read)
      return respond 400, "invalid file parameter"
    end

    # Deliberately not named `body`, which would shadow Sinatra's `body`
    # helper inside this route.
    file_content = upload[:tempfile].read

    # Run the 'detect-language' action in the foreground and answer with its
    # result, using the MIME type reported by the result itself.
    content = Converter.new(logger: @logger).convert_file('detect-language', file_content)
    content_type content.mime_type
    content
  rescue StandardError => e
    # Any failure is turned into an error response by the shared helper.
    respond_with_error e
  end
end

# Legacy method to convert files
# Brought over from Heathen
#
Expand Down
19 changes: 19 additions & 0 deletions lib/heathen/processor_methods/detect_language.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

module Heathen
  class Processor
    # Detects the language of the job's content.
    #
    # Runs the Tika executable with the `--language` flag against the job's
    # content file and replaces `job.content` with the command's standard
    # output.
    def detect_language
      # NOTE(review): the 'detect-language' task is registered for every MIME
      # type ('.*') and the README example uploads a .docx file, yet only PDF
      # input is accepted here - confirm which behaviour is intended.
      expect_mime_type 'application/pdf'

      # config/tika.xml sits at the repository root, three directories above
      # lib/heathen/processor_methods (this file's directory).
      tika_config = File.expand_path('../../../config/tika.xml', __dir__)

      executioner.execute(
        Colore::C_.tika_path,
        "--config=#{tika_config}",
        '--language',
        job.content_file,
        binary: true
      )

      job.content = executioner.stdout
    end
  end
end
4 changes: 4 additions & 0 deletions lib/heathen/task.rb
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,7 @@ def task_key(action, mime_type)
# Registers the 'doc' task for the '.*' pattern; it performs the 'msoffice' task.
Heathen::Task.register('doc', '.*') { perform_task 'msoffice' }

# Registers the 'detect-language' task for the '.*' pattern; it performs the
# 'detect_language' task.
Heathen::Task.register('detect-language', '.*') { perform_task 'detect_language' }
20 changes: 20 additions & 0 deletions spec/heathen/processor_methods/detect_language_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# frozen_string_literal: true

require 'spec_helper'

describe Heathen::Processor do
  # Processor under test, logging to standard error.
  let(:processor) { described_class.new(job: job, logger: Logger.new($stderr)) }
  # Job wrapping the raw bytes of the PDF fixture.
  let(:job) { Heathen::Job.new('foo', content, 'en') }
  let(:content) { File.read(fixture('heathen/quickfox.pdf')) }

  # Let the processor clean up after every example.
  after { processor.clean_up }

  describe '#detect_language' do
    it 'detects input file language' do
      processor.detect_language

      expect(job.content).to eq('en')
    end
  end
end

0 comments on commit d48a68a

Please sign in to comment.