Skip to content

Commit

Permalink
add feature to skip OCR on PDF files with text
Browse files Browse the repository at this point in the history
Signed-off-by: Florian Freund <florian@freund.zone>
  • Loading branch information
youduda committed Aug 1, 2023
1 parent e1405e4 commit 5ca4337
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 16 deletions.
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
},
"require": {
"thiagoalessio/tesseract_ocr": "2.12.0",
"spatie/pdf-to-image": "2.2.0"
"spatie/pdf-to-image": "2.2.0",
"spatie/pdf-to-text": "1.52.0"
}
}
120 changes: 119 additions & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions js/admin.elements.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ var fts_tesseract_elements = {
tesseract_lang: null,
tesseract_pdf: null,
tesseract_pdf_limit: null,
tesseract_pdf_skip_text: null,

init: function () {
fts_tesseract_elements.tesseract_div = $('#files_ocr-tesseract');
Expand All @@ -44,12 +45,14 @@ var fts_tesseract_elements = {
fts_tesseract_elements.tesseract_ocr = $('#tesseract_ocr');
fts_tesseract_elements.tesseract_pdf = $('#tesseract_pdf');
fts_tesseract_elements.tesseract_pdf_limit = $('#tesseract_pdf_limit');
fts_tesseract_elements.tesseract_pdf_skip_text = $('#tesseract_pdf_skip_text');

fts_tesseract_elements.tesseract_ocr.on('change', fts_tesseract_elements.updateSettings);
fts_tesseract_elements.tesseract_psm.on('change', fts_tesseract_elements.updateSettings);
fts_tesseract_elements.tesseract_lang.on('change', fts_tesseract_elements.updateSettings);
fts_tesseract_elements.tesseract_pdf.on('change', fts_tesseract_elements.updateSettings);
fts_tesseract_elements.tesseract_pdf_limit.on('change', fts_tesseract_elements.updateSettings);
fts_tesseract_elements.tesseract_pdf_skip_text.on('change', fts_tesseract_elements.updateSettings);
},


Expand Down
4 changes: 3 additions & 1 deletion js/admin.settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ var fts_tesseract_settings = {
fts_tesseract_elements.tesseract_lang.val(result.tesseract_lang);
fts_tesseract_elements.tesseract_pdf.prop('checked', (result.tesseract_pdf === '1'));
fts_tesseract_elements.tesseract_pdf_limit.val(result.tesseract_pdf_limit);
fts_tesseract_elements.tesseract_pdf_skip_text.prop('checked', (result.tesseract_pdf_skip_text === '1'));

fts_admin_settings.tagSettingsAsSaved(fts_tesseract_elements.tesseract_div);

Expand All @@ -73,7 +74,8 @@ var fts_tesseract_settings = {
tesseract_psm: fts_tesseract_elements.tesseract_psm.val(),
tesseract_lang: fts_tesseract_elements.tesseract_lang.val(),
tesseract_pdf: (fts_tesseract_elements.tesseract_pdf.is(':checked')) ? 1 : 0,
tesseract_pdf_limit: fts_tesseract_elements.tesseract_pdf_limit.val()
tesseract_pdf_limit: fts_tesseract_elements.tesseract_pdf_limit.val(),
tesseract_pdf_skip_text: (fts_tesseract_elements.tesseract_pdf_skip_text.is(':checked')) ? 1 : 0
};

$.ajax({
Expand Down
25 changes: 14 additions & 11 deletions lib/Service/ConfigService.php
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,15 @@ class ConfigService {
const TESSERACT_LANG = 'tesseract_lang';
const TESSERACT_PDF = 'tesseract_pdf';
const TESSERACT_PDF_LIMIT = 'tesseract_pdf_limit';
const TESSERACT_PDF_SKIP_TEXT = 'tesseract_pdf_skip_text';

public $defaults = [
self::TESSERACT_ENABLED => '0',
self::TESSERACT_PSM => '4',
self::TESSERACT_LANG => 'eng',
self::TESSERACT_PDF => '0',
self::TESSERACT_PDF_LIMIT => '0'
self::TESSERACT_ENABLED => '0',
self::TESSERACT_PSM => '4',
self::TESSERACT_LANG => 'eng',
self::TESSERACT_PDF => '0',
self::TESSERACT_PDF_LIMIT => '0',
self::TESSERACT_PDF_SKIP_TEXT => '0',
];


Expand All @@ -78,12 +80,13 @@ public function onGetConfig(GenericEvent $e) {
$config = $e->getArgument('config');
$config['files_fulltextsearch_tesseract'] =
[
'version' => $this->getAppValue('installed_version'),
'enabled' => $this->getAppValue(self::TESSERACT_ENABLED),
'psm' => $this->getAppValue(self::TESSERACT_PSM),
'lang' => $this->getAppValue(self::TESSERACT_LANG),
'pdf' => $this->getAppValue(self::TESSERACT_PDF),
'pdf_limit' => $this->getAppValue(self::TESSERACT_PDF_LIMIT),
'version' => $this->getAppValue('installed_version'),
'enabled' => $this->getAppValue(self::TESSERACT_ENABLED),
'psm' => $this->getAppValue(self::TESSERACT_PSM),
'lang' => $this->getAppValue(self::TESSERACT_LANG),
'pdf' => $this->getAppValue(self::TESSERACT_PDF),
'pdf_limit' => $this->getAppValue(self::TESSERACT_PDF_LIMIT),
'pdf_skip_text' => $this->getAppValue(self::TESSERACT_PDF_SKIP_TEXT),
];
$e->setArgument('config', $config);
}
Expand Down
29 changes: 27 additions & 2 deletions lib/Service/TesseractService.php
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,13 @@
use OCP\FullTextSearch\Model\ISearchRequest;
use Psr\Log\LoggerInterface;
use Spatie\PdfToImage\Exceptions\PageDoesNotExist;
use Spatie\PdfToImage\Pdf;
use Spatie\PdfToImage\Pdf as PdfToImage_Pdf;
use Spatie\PdfToText\Pdf as PdfToText_Pdf;
use thiagoalessio\TesseractOCR\TesseractOCR;
use Throwable;



/**
* Class TesseractService
*
Expand Down Expand Up @@ -213,6 +215,22 @@ private function ocrFileFromPath(string $path): string {
}


/**
 * Tell whether the PDF at the given path yields any extractable text.
 *
 * The file is handed to the pdf-to-text extractor; the result is true when
 * the extractor returns anything other than the empty string. When the
 * extraction throws an Exception, the failure is logged at notice level
 * and the PDF is reported as containing no text.
 *
 * @param string $path path of the PDF file passed to the extractor
 *
 * @return bool true when text was extracted, false when the extracted text
 *              is empty or the extraction failed
 */
private function pdfContainsText(string $path): bool {
	// Pessimistic default: stays false if the extractor throws.
	$hasText = false;

	try {
		$extractor = (new PdfToText_Pdf())->setPdf($path);
		$hasText = ($extractor->text() !== '');
	} catch (Exception $failure) {
		$this->logger->notice('extracting text from PDF failed', ['exception' => $failure, 'path' => $path]);
	}

	return $hasText;
}


/**
* @param AFilesDocument $document
* @param File $file
Expand All @@ -234,7 +252,14 @@ private function ocrPdf(AFilesDocument $document, File $file): bool {
try {
$path = $this->getAbsolutePath($file);
$this->logger->debug('Absolute path', ['path' => $path]);
$pdf = new Pdf($path);

if ($this->configService->optionIsSelected(ConfigService::TESSERACT_PDF_SKIP_TEXT)
&& $this->pdfContainsText($path)) {
$this->logger->debug('PDF file contains text, skipping OCR');
return true;
}

$pdf = new PdfToImage_Pdf($path);
} catch (Exception $e) {
$this->logger->notice('failed to ocrPdf', ['exception' => $e, 'document' => $document]);
throw new NotFoundException();
Expand Down
11 changes: 11 additions & 0 deletions templates/settings.admin.php
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,17 @@
</div>
</div>

<div class="div-table-row tesseract_ocr_enabled">
<div class="div-table-col div-table-col-left">
<span class="leftcol">Skip OCR on PDF with text</span>
<br/>
<em>Only OCR PDF files without text (e.g. scans). Use the embedded text otherwise. pdftotext must be installed.</em>
</div>
<div class="div-table-col">
<input type="checkbox" id="tesseract_pdf_skip_text" value="1"/>
</div>
</div>

</div>


Expand Down

0 comments on commit 5ca4337

Please sign in to comment.