Skip to content

Commit

Permalink
Adding pdftotext for PDF support in Hypercube (#77)
Browse files Browse the repository at this point in the history
  • Loading branch information
dannylamb authored and seth-shaw-unlv committed Sep 18, 2019
1 parent ff58f34 commit 4173b13
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 14 deletions.
5 changes: 3 additions & 2 deletions Hypercube/cfg/config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

hypercube:
# paths to the tesseract and pdftotext executables
executable: tesseract
tesseract_executable: tesseract
pdftotext_executable: pdftotext

fedora_resource:
base_url: http://localhost:8080/fcrepo/rest
Expand All @@ -20,4 +21,4 @@ syn:
# Path to the syn config file for authentication.
# example can be found here:
# https://github.com/Islandora-CLAW/Syn/blob/master/conf/syn-settings.example.xml
config: ../syn-settings.xml
config: ../syn-settings.xml
44 changes: 37 additions & 7 deletions Hypercube/src/Controller/HypercubeController.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use GuzzleHttp\Psr7\StreamWrapper;
use Islandora\Crayfish\Commons\CmdExecuteService;
use Monolog\Logger;
use Psr\Http\Message\ResponseInterface;
use Symfony\Component\HttpFoundation\BinaryFileResponse;
use Symfony\Component\HttpFoundation\Request;
Expand All @@ -25,17 +26,35 @@ class HypercubeController
/**
* @var string
*/
protected $executable;
protected $tesseract_executable;

/**
* @var string
*/
protected $pdftotext_executable;

/**
* @var \Monolog\Logger
*/
protected $log;

/**
* HypercubeController constructor.
*
* Stores the command execution service, both executable names and the
* logger on the instance for later use by the controller's actions.
*
* @param \Islandora\Crayfish\Commons\CmdExecuteService $cmd
*   Service kept in $this->cmd.
* @param string $executable
* @param string $tesseract_executable
*   Command used by get() when the resource is not 'application/pdf'.
* @param string $pdftotext_executable
*   Command used by get() when the resource Content-Type is 'application/pdf'.
* @param \Monolog\Logger $log
*   Logger used for the debug messages emitted by get().
*/
public function __construct(CmdExecuteService $cmd, $executable)
{
public function __construct(
CmdExecuteService $cmd,
$tesseract_executable,
$pdftotext_executable,
Logger $log
) {
$this->cmd = $cmd;
$this->executable = $executable;
$this->tesseract_executable = $tesseract_executable;
$this->pdftotext_executable = $pdftotext_executable;
$this->log = $log;
}

/**
Expand All @@ -50,10 +69,21 @@ public function get(Request $request)
// Get tiff as a resource.
$body = StreamWrapper::getResource($fedora_resource->getBody());

// Arguments to OCR command are sent as a custom header
// Arguments to command line are sent as a custom header
$args = $request->headers->get('X-Islandora-Args');

$cmd_string = $this->executable . ' stdin stdout ' . $args;
// Check content type and use the appropriate command line tool.
$content_type = $fedora_resource->getHeader('Content-Type')[0];

$this->log->debug("Got Content-Type:", ['type' => $content_type]);

if ($content_type == 'application/pdf') {
$cmd_string = $this->pdftotext_executable . " $args - -";
} else {
$cmd_string = $this->tesseract_executable . " stdin stdout $args";
}

$this->log->debug("Executing command:", ['cmd' => $cmd_string]);

// Return response.
try {
Expand Down
4 changes: 3 additions & 1 deletion Hypercube/src/app.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
// Register the Hypercube controller under 'hypercube.controller' as a closure
// that builds a HypercubeController from the command execution service, the
// 'crayfish.hypercube.*_executable' entries and the 'monolog' logger.
$app['hypercube.controller'] = function ($app) {
return new HypercubeController(
$app['crayfish.cmd_execute_service'],
$app['crayfish.hypercube.executable']
$app['crayfish.hypercube.tesseract_executable'],
$app['crayfish.hypercube.pdftotext_executable'],
$app['monolog']
);
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use Islandora\Crayfish\Commons\CmdExecuteService;
use Islandora\Hypercube\Controller\HypercubeController;
use Monolog\Logger;
use Prophecy\Argument;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\StreamInterface;
Expand All @@ -21,9 +22,12 @@ class HypercubeControllerTest extends \PHPUnit_Framework_TestCase
public function testOptions()
{
$mock_service = $this->prophesize(CmdExecuteService::class)->reveal();
$mock_logger = $this->prophesize(Logger::class)->reveal();
$controller = new HypercubeController(
$mock_service,
''
'tesseract',
'pdftotext',
$mock_logger
);

$response = $controller->options();
Expand All @@ -39,12 +43,27 @@ public function testOptions()
* @covers ::get
*/
public function testTesseractErrorReturns500()
{
// Run the shared error scenario for an 'image/tiff' resource.
$this->errorReturns500('image/tiff');
}

/**
* Runs the shared error scenario for an 'application/pdf' resource.
*
* NOTE(review): the errorReturns500() helper, as far as it is visible here,
* stubs the Fedora response Content-Type as ['image/tiff'] rather than the
* $mimetype it receives, so this test may never reach the pdftotext
* branch of get() - confirm and pass $mimetype through if so.
*
* @covers ::__construct
* @covers ::get
*/
public function testPdfToTextErrorReturns500()
{
$this->errorReturns500('application/pdf');
}

protected function errorReturns500($mimetype)
{
// Mock a TesseractService to create a controller.
$prophecy = $this->prophesize(CmdExecuteService::class);
$prophecy->execute(Argument::any(), Argument::any())->willThrow(new \RuntimeException("ERROR", 500));
$mock_service = $prophecy->reveal();
$controller = new HypercubeController($mock_service, '');
$mock_logger = $this->prophesize(Logger::class)->reveal();
$controller = new HypercubeController($mock_service, 'tesseract', 'pdftotext', $mock_logger);

// Mock a stream body for a Fedora response.
$prophecy = $this->prophesize(StreamInterface::class);
Expand All @@ -54,6 +73,7 @@ public function testTesseractErrorReturns500()

// Mock a Fedora response.
$prophecy = $this->prophesize(ResponseInterface::class);
$prophecy->getHeader('Content-Type')->willReturn(['image/tiff']);
$prophecy->getStatusCode()->willReturn(200);
$prophecy->getBody()->willReturn($mock_stream);
$mock_fedora_response = $prophecy->reveal();
Expand All @@ -78,10 +98,25 @@ public function testTesseractErrorReturns500()
*/
public function testTesseractSuccessReturns200()
{
// Run the shared success scenario for an 'image/tiff' resource.
$this->successReturns200('image/tiff');
}

/**
* Runs the shared success scenario for an 'application/pdf' resource.
*
* The successReturns200() helper stubs the Fedora response Content-Type with
* the given mimetype and asserts that get() answers with status 200.
*
* @covers ::__construct
* @covers ::get
*/
public function testPdfToTextSuccessReturns200()
{
$this->successReturns200('application/pdf');
}

protected function successReturns200($mimetype)
{
// Mock a controller.
$prophecy = $this->prophesize(CmdExecuteService::class);
$mock_service = $prophecy->reveal();
$controller = new HypercubeController($mock_service, '');
$mock_logger = $this->prophesize(Logger::class)->reveal();
$controller = new HypercubeController($mock_service, 'tesseract', 'pdftotext', $mock_logger);

// Mock a stream body for a Fedora response.
$prophecy = $this->prophesize(StreamInterface::class);
Expand All @@ -91,6 +126,7 @@ public function testTesseractSuccessReturns200()

// Mock a Fedora response.
$prophecy = $this->prophesize(ResponseInterface::class);
$prophecy->getHeader('Content-Type')->willReturn([$mimetype]);
$prophecy->getStatusCode()->willReturn(200);
$prophecy->getBody()->willReturn($mock_stream);
$mock_fedora_response = $prophecy->reveal();
Expand All @@ -104,6 +140,7 @@ public function testTesseractSuccessReturns200()
$request->headers->set('ApixLdpResource', 'http://localhost:8080/fcrepo/rest/foo');
$request->attributes->set('fedora_resource', $mock_fedora_response);

// Check success.
$response = $controller->get($request);
$this->assertTrue($response->getStatusCode() == 200, "Response must return 200");
}
Expand Down

0 comments on commit 4173b13

Please sign in to comment.