diff --git a/composer.json b/composer.json index 0e54e38..f0dcc1a 100644 --- a/composer.json +++ b/composer.json @@ -21,6 +21,7 @@ }, "require": { "thiagoalessio/tesseract_ocr": "2.12.0", - "spatie/pdf-to-image": "2.2.0" + "spatie/pdf-to-image": "2.2.0", + "spatie/pdf-to-text": "1.52.0" } } diff --git a/composer.lock b/composer.lock index c331a25..87162b1 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "bd97cd5f1ff1cb9895d6fcccd5ed8a95", + "content-hash": "f4814f085f2fadd66c115ba063adba3a", "packages": [ { "name": "spatie/pdf-to-image", @@ -66,6 +66,124 @@ ], "time": "2022-03-08T07:52:26+00:00" }, + { + "name": "spatie/pdf-to-text", + "version": "1.52.0", + "source": { + "type": "git", + "url": "https://github.com/spatie/pdf-to-text.git", + "reference": "3f1033a5fb25c8b4d2f7bc72d2120fac3723bf5c" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spatie/pdf-to-text/zipball/3f1033a5fb25c8b4d2f7bc72d2120fac3723bf5c", + "reference": "3f1033a5fb25c8b4d2f7bc72d2120fac3723bf5c", + "shasum": "" + }, + "require": { + "php": "^7.4|^8.0", + "symfony/process": "^4.0|^5.0|^6.0" + }, + "require-dev": { + "phpunit/phpunit": "^9.5" + }, + "type": "library", + "autoload": { + "psr-4": { + "Spatie\\PdfToText\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Freek Van der Herten", + "email": "freek@spatie.be", + "homepage": "https://spatie.be", + "role": "Developer" + } + ], + "description": "Extract text from a pdf", + "homepage": "https://github.com/spatie/pdf-to-text", + "keywords": [ + "pdf-to-text", + "spatie" + ], + "support": { + "issues": "https://github.com/spatie/pdf-to-text/issues", + "source": "https://github.com/spatie/pdf-to-text/tree/1.52.0" + }, + "funding": [ + { + "url": "https://spatie.be/open-source/support-us", + "type": "custom" + } + ], + "time": "2022-07-14T20:49:11+00:00" + }, + { + "name": "symfony/process", + "version": "v6.3.2", + "source": { + "type": "git", + "url": "https://github.com/symfony/process.git", + "reference": "c5ce962db0d9b6e80247ca5eb9af6472bd4d7b5d" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/process/zipball/c5ce962db0d9b6e80247ca5eb9af6472bd4d7b5d", + "reference": "c5ce962db0d9b6e80247ca5eb9af6472bd4d7b5d", + "shasum": "" + }, + "require": { + "php": ">=8.1" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\Process\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Executes commands in sub-processes", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/process/tree/v6.3.2" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2023-07-12T16:00:22+00:00" + }, { "name": "thiagoalessio/tesseract_ocr", "version": "2.12.0", diff --git a/js/admin.elements.js b/js/admin.elements.js index b843357..7429197 100644 --- a/js/admin.elements.js +++ b/js/admin.elements.js @@ -36,6 +36,7 @@ var fts_tesseract_elements = { tesseract_lang: null, tesseract_pdf: null, tesseract_pdf_limit: null, + tesseract_pdf_skip_text: null, init: function () { fts_tesseract_elements.tesseract_div = $('#files_ocr-tesseract'); @@ -44,12 +45,14 @@ var fts_tesseract_elements = { fts_tesseract_elements.tesseract_ocr = $('#tesseract_ocr'); fts_tesseract_elements.tesseract_pdf = $('#tesseract_pdf'); fts_tesseract_elements.tesseract_pdf_limit = $('#tesseract_pdf_limit'); + fts_tesseract_elements.tesseract_pdf_skip_text = $('#tesseract_pdf_skip_text'); fts_tesseract_elements.tesseract_ocr.on('change', fts_tesseract_elements.updateSettings); fts_tesseract_elements.tesseract_psm.on('change', fts_tesseract_elements.updateSettings); fts_tesseract_elements.tesseract_lang.on('change', fts_tesseract_elements.updateSettings); fts_tesseract_elements.tesseract_pdf.on('change', fts_tesseract_elements.updateSettings); fts_tesseract_elements.tesseract_pdf_limit.on('change', fts_tesseract_elements.updateSettings); + fts_tesseract_elements.tesseract_pdf_skip_text.on('change', fts_tesseract_elements.updateSettings); }, diff --git a/js/admin.settings.js b/js/admin.settings.js index 618fb2b..e8b9a64 100644 --- a/js/admin.settings.js +++ b/js/admin.settings.js @@ -51,6 +51,7 @@ var fts_tesseract_settings = { fts_tesseract_elements.tesseract_lang.val(result.tesseract_lang); fts_tesseract_elements.tesseract_pdf.prop('checked', (result.tesseract_pdf === '1')); fts_tesseract_elements.tesseract_pdf_limit.val(result.tesseract_pdf_limit); + fts_tesseract_elements.tesseract_pdf_skip_text.prop('checked', (result.tesseract_pdf_skip_text === '1')); fts_admin_settings.tagSettingsAsSaved(fts_tesseract_elements.tesseract_div); @@ -73,7 +74,8 @@ var fts_tesseract_settings = { tesseract_psm: fts_tesseract_elements.tesseract_psm.val(), tesseract_lang: fts_tesseract_elements.tesseract_lang.val(), tesseract_pdf: (fts_tesseract_elements.tesseract_pdf.is(':checked')) ? 1 : 0, - tesseract_pdf_limit: fts_tesseract_elements.tesseract_pdf_limit.val() + tesseract_pdf_limit: fts_tesseract_elements.tesseract_pdf_limit.val(), + tesseract_pdf_skip_text: (fts_tesseract_elements.tesseract_pdf_skip_text.is(':checked')) ? 1 : 0 }; $.ajax({ diff --git a/lib/Service/ConfigService.php b/lib/Service/ConfigService.php index 821017d..b2af544 100644 --- a/lib/Service/ConfigService.php +++ b/lib/Service/ConfigService.php @@ -49,13 +49,15 @@ class ConfigService { const TESSERACT_LANG = 'tesseract_lang'; const TESSERACT_PDF = 'tesseract_pdf'; const TESSERACT_PDF_LIMIT = 'tesseract_pdf_limit'; + const TESSERACT_PDF_SKIP_TEXT = 'tesseract_pdf_skip_text'; public $defaults = [ - self::TESSERACT_ENABLED => '0', - self::TESSERACT_PSM => '4', - self::TESSERACT_LANG => 'eng', - self::TESSERACT_PDF => '0', - self::TESSERACT_PDF_LIMIT => '0' + self::TESSERACT_ENABLED => '0', + self::TESSERACT_PSM => '4', + self::TESSERACT_LANG => 'eng', + self::TESSERACT_PDF => '0', + self::TESSERACT_PDF_LIMIT => '0', + self::TESSERACT_PDF_SKIP_TEXT => '0', ]; @@ -78,12 +80,13 @@ public function onGetConfig(GenericEvent $e) { $config = $e->getArgument('config'); $config['files_fulltextsearch_tesseract'] = [ - 'version' => $this->getAppValue('installed_version'), - 'enabled' => $this->getAppValue(self::TESSERACT_ENABLED), - 'psm' => $this->getAppValue(self::TESSERACT_PSM), - 'lang' => $this->getAppValue(self::TESSERACT_LANG), - 'pdf' => $this->getAppValue(self::TESSERACT_PDF), - 'pdf_limit' => $this->getAppValue(self::TESSERACT_PDF_LIMIT), + 'version' => $this->getAppValue('installed_version'), + 'enabled' => $this->getAppValue(self::TESSERACT_ENABLED), + 'psm' => $this->getAppValue(self::TESSERACT_PSM), + 'lang' => $this->getAppValue(self::TESSERACT_LANG), + 'pdf' => $this->getAppValue(self::TESSERACT_PDF), + 'pdf_limit' => $this->getAppValue(self::TESSERACT_PDF_LIMIT), + 'pdf_skip_text' => $this->getAppValue(self::TESSERACT_PDF_SKIP_TEXT), ]; $e->setArgument('config', $config); } diff --git a/lib/Service/TesseractService.php b/lib/Service/TesseractService.php index 16d49b6..e123fb6 100644 --- a/lib/Service/TesseractService.php +++ b/lib/Service/TesseractService.php @@ -42,11 +42,13 @@ use OCP\FullTextSearch\Model\ISearchRequest; use Psr\Log\LoggerInterface; use Spatie\PdfToImage\Exceptions\PageDoesNotExist; -use Spatie\PdfToImage\Pdf; +use Spatie\PdfToImage\Pdf as PdfToImage_Pdf; +use Spatie\PdfToText\Pdf as PdfToText_Pdf; use thiagoalessio\TesseractOCR\TesseractOCR; use Throwable; + /** * Class TesseractService * @@ -213,6 +215,22 @@ private function ocrFileFromPath(string $path): string { } + /** + * @param string $path + * + * @return bool + */ + private function pdfContainsText(string $path): bool { + try { + $text = (new PdfToText_Pdf())->setPdf($path)->text(); + return $text !== ''; + } catch (Exception $e) { + $this->logger->notice('extracting text from PDF failed', ['exception' => $e, 'path' => $path]); + } + return false; + } + + /** * @param AFilesDocument $document * @param File $file @@ -234,7 +252,14 @@ private function ocrPdf(AFilesDocument $document, File $file): bool { try { $path = $this->getAbsolutePath($file); $this->logger->debug('Absolute path', ['path' => $path]); - $pdf = new Pdf($path); + + if ($this->configService->optionIsSelected(ConfigService::TESSERACT_PDF_SKIP_TEXT) + && $this->pdfContainsText($path)) { + $this->logger->debug('PDF file contains text, skipping OCR'); + return true; + } + + $pdf = new PdfToImage_Pdf($path); } catch (Exception $e) { $this->logger->notice('failed to ocrPdf', ['exception' => $e, 'document' => $document]); throw new NotFoundException(); diff --git a/templates/settings.admin.php b/templates/settings.admin.php index f22e209..94c67bc 100644 --- a/templates/settings.admin.php +++ b/templates/settings.admin.php @@ -98,6 +98,17 @@ +
+
+ Skip OCR on PDF with text +
+ Only OCR PDF files without text (e.g. scans). Use the embedded text otherwise. pdftotext must be installed. +
+
+ +
+
+