diff --git a/composer.json b/composer.json index 7d89ffa..0681584 100644 --- a/composer.json +++ b/composer.json @@ -27,7 +27,9 @@ "symfony/serializer": "^7.1", "symfony/property-access": "^7.1", "webmozart/assert": "^1.11", - "league/commonmark": "^2.4" + "league/commonmark": "^2.4", + "smalot/pdfparser": "^2.10", + "symfony/filesystem": "^7.1" }, "require-dev": { "phpunit/phpunit": "^11.1", diff --git a/composer.lock b/composer.lock index 5fcee88..610d91e 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "1a1fc0bc0e899a1c9197cbd2c6d17d37", + "content-hash": "0cab03fcceac267b78fa89ff1ea15a6a", "packages": [ { "name": "dflydev/dot-access-data", @@ -570,6 +570,57 @@ }, "time": "2021-07-14T16:46:02+00:00" }, + { + "name": "smalot/pdfparser", + "version": "v2.10.0", + "source": { + "type": "git", + "url": "https://github.com/smalot/pdfparser.git", + "reference": "14adf318f8620a6195c0b00d51c6a507837b9ff4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/smalot/pdfparser/zipball/14adf318f8620a6195c0b00d51c6a507837b9ff4", + "reference": "14adf318f8620a6195c0b00d51c6a507837b9ff4", + "shasum": "" + }, + "require": { + "ext-iconv": "*", + "ext-zlib": "*", + "php": ">=7.1", + "symfony/polyfill-mbstring": "^1.18" + }, + "type": "library", + "autoload": { + "psr-0": { + "Smalot\\PdfParser\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "LGPL-3.0" + ], + "authors": [ + { + "name": "Sebastien MALOT", + "email": "sebastien@malot.fr" + } + ], + "description": "Pdf parser library. 
Can read and extract information from pdf file.", + "homepage": "https://www.pdfparser.org", + "keywords": [ + "extract", + "parse", + "parser", + "pdf", + "text" + ], + "support": { + "issues": "https://github.com/smalot/pdfparser/issues", + "source": "https://github.com/smalot/pdfparser/tree/v2.10.0" + }, + "time": "2024-04-29T06:36:50+00:00" + }, { "name": "symfony/deprecation-contracts", "version": "v3.5.0", @@ -637,6 +688,72 @@ ], "time": "2024-04-18T09:32:20+00:00" }, + { + "name": "symfony/filesystem", + "version": "v7.1.2", + "source": { + "type": "git", + "url": "https://github.com/symfony/filesystem.git", + "reference": "92a91985250c251de9b947a14bb2c9390b1a562c" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/filesystem/zipball/92a91985250c251de9b947a14bb2c9390b1a562c", + "reference": "92a91985250c251de9b947a14bb2c9390b1a562c", + "shasum": "" + }, + "require": { + "php": ">=8.2", + "symfony/polyfill-ctype": "~1.8", + "symfony/polyfill-mbstring": "~1.8" + }, + "require-dev": { + "symfony/process": "^6.4|^7.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\Filesystem\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Provides basic utilities for the filesystem", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/filesystem/tree/v7.1.2" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2024-06-28T10:03:55+00:00" + }, { "name": 
"symfony/http-client", "version": "v7.1.1", diff --git a/src/Retrieval/Loader/DirectoryLoader.php b/src/Retrieval/Loader/Directory/DirectoryLoader.php similarity index 50% rename from src/Retrieval/Loader/DirectoryLoader.php rename to src/Retrieval/Loader/Directory/DirectoryLoader.php index ddad6fa..734ba28 100644 --- a/src/Retrieval/Loader/DirectoryLoader.php +++ b/src/Retrieval/Loader/Directory/DirectoryLoader.php @@ -11,9 +11,11 @@ declare(strict_types=1); -namespace Devscast\Lugha\Retrieval\Loader; +namespace Devscast\Lugha\Retrieval\Loader\Directory; use Devscast\Lugha\Retrieval\Document; +use Devscast\Lugha\Retrieval\Loader\LoaderInterface; +use Devscast\Lugha\Retrieval\Metadata; use Devscast\Lugha\Retrieval\Splitter\SplitterInterface; /** @@ -25,24 +27,47 @@ readonly class DirectoryLoader implements LoaderInterface { public function __construct( - public string $directory, - public ?string $glob = null + public string $path ) { } /** * @return iterable + * @throws \RuntimeException When a matched file cannot be read */ + #[\Override] public function load(): iterable { - return []; + /** @var RecursiveDirectoryIterator|\DirectoryIterator $file */ + foreach (new WildcardDirectoryIterator($this->path) as $file) { + if (! $file->isFile()) { + continue; + } + + // file_get_contents() returns false on failure; fail loudly instead of yielding an empty document. 
+ $content = file_get_contents($file->getPathname()); + if ($content === false) { + throw new \RuntimeException(sprintf('Unable to read the content of %s', $file->getPathname())); + } + + yield new Document( + content: $content, + metadata: new Metadata( + sourceType: 'file', + sourceName: $file->getFilename(), + ), + ); + } } /** * @return iterable */ + #[\Override] public function loadAndSplit(SplitterInterface $splitter): iterable { - return []; + foreach ($this->load() as $document) { + yield from $splitter->createDocuments($document); + } } } diff --git a/src/Retrieval/Loader/Directory/RecursiveDirectoryIterator.php b/src/Retrieval/Loader/Directory/RecursiveDirectoryIterator.php new file mode 100644 index 0000000..2114bf7 --- /dev/null +++ b/src/Retrieval/Loader/Directory/RecursiveDirectoryIterator.php @@ -0,0 +1,33 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was 
distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Directory; + +use RecursiveDirectoryIterator as NativeRecursiveDirectoryIterator; +use RecursiveIteratorIterator; + +/** + * Class RecursiveDirectoryIterator. + * + * @extends RecursiveIteratorIterator + * @see https://www.php.net/manual/en/class.recursivedirectoryiterator.php + * + * @author bernard-ng + */ +final class RecursiveDirectoryIterator extends RecursiveIteratorIterator +{ + public function __construct(string $path) + { + parent::__construct(new NativeRecursiveDirectoryIterator($path)); + } +} diff --git a/src/Retrieval/Loader/Directory/WildcardDirectoryIterator.php b/src/Retrieval/Loader/Directory/WildcardDirectoryIterator.php new file mode 100644 index 0000000..69b0b6d --- /dev/null +++ b/src/Retrieval/Loader/Directory/WildcardDirectoryIterator.php @@ -0,0 +1,69 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Directory; + +use DirectoryIterator; +use FilterIterator; + +/** + * Class WildcardDirectoryIterator. + * + * @extends FilterIterator + * + * @author bernard-ng + */ +final class WildcardDirectoryIterator extends FilterIterator +{ + /** + * Anchored regex every accepted filename must match; null when the path has no wildcard, meaning every entry is accepted. + */ + private ?string $regex = null; + + public function __construct(string $path) + { + $recursive = false; + + if (str_starts_with($path, '-R ')) { + $recursive = true; + $path = substr($path, 3); + } + + if (preg_match('~/?([^/]*\*[^/]*)$~', $path, $matches)) { // matched wildcards in filename + $path = substr($path, 0, -strlen($matches[1]) - 1); // strip wildcards part from path + $this->regex = '~^' . str_replace('\*', '.*', preg_quote($matches[1], '~')) . 
'$~'; // quote regex metacharacters, then turn each wildcard into ".*" + + if ($path === '') { + $path = '.'; // if no path given, we assume CWD; + } + } + + parent::__construct($recursive ? 
new RecursiveDirectoryIterator($path) : new DirectoryIterator($path)); + } + + /** + * Checks for regex in current filename, or matches all if no regex specified + */ + #[\Override] + public function accept(): bool + { + if ($this->regex === null) { + return true; // no wildcard given: accept every entry + } + + /** @var RecursiveDirectoryIterator|DirectoryIterator $iterator */ + $iterator = $this->getInnerIterator(); + + return (bool) preg_match($this->regex, $iterator->getFilename()); + } +} diff --git a/src/Retrieval/Loader/Reader/AbstractReader.php b/src/Retrieval/Loader/Reader/AbstractReader.php new file mode 100644 index 0000000..223a241 --- /dev/null +++ b/src/Retrieval/Loader/Reader/AbstractReader.php @@ -0,0 +1,64 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Reader; + +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\FileNotFoundException; +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnreadableFileException; +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnsupportedFileException; +use Symfony\Component\Filesystem\Path; + +/** + * Class AbstractReader. + * + * @author bernard-ng + */ +abstract readonly class AbstractReader +{ + /** + * Supported extensions regex pattern; concrete readers override it (the empty default is not a valid regex) + */ + public const string SUPPORTED_EXTENSIONS_PATTERN = ''; + + /** + * @throws UnsupportedFileException If the file extension is not supported and the check is not skipped. 
+ * @throws UnreadableFileException When the content cannot be read for any other reason + * @throws FileNotFoundException When the given file does not exist + */ + abstract public function readContent(string $path, bool $skipExtensionCheck = false): string; + + final public function supports(string $path): bool + { + $matched = preg_match(static::SUPPORTED_EXTENSIONS_PATTERN, Path::getExtension($path, forceLowerCase: true)); + return $matched === 1; + } + + final public function ensureSupported(string $path): void + { + if ($this->supports($path)) { + return; // extension accepted by this reader's pattern + } + throw new UnsupportedFileException([Path::getExtension($path, forceLowerCase: true), static::SUPPORTED_EXTENSIONS_PATTERN]); + } + + final public function ensureFileExists(string $path): void + { + // A missing file is reported before an unreadable one, + // so the two failure modes never mask each other. + match (true) { + ! file_exists($path) => throw new FileNotFoundException($path), + ! is_readable($path) => throw new UnreadableFileException($path), + default => null, + }; + } +} diff --git a/src/Retrieval/Loader/Reader/DocReader.php b/src/Retrieval/Loader/Reader/DocReader.php new file mode 100644 index 0000000..785e38a --- /dev/null +++ b/src/Retrieval/Loader/Reader/DocReader.php @@ -0,0 +1,35 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Reader; + +/** + * Class DocReader. 
+ * + * @author bernard-ng + */ +final readonly class DocReader extends AbstractReader +{ + public const string SUPPORTED_EXTENSIONS_PATTERN = '/^docx?$/'; + + #[\Override] + public function readContent(string $path, bool $skipExtensionCheck = false): string + { + if ($skipExtensionCheck === false) { + $this->ensureSupported($path); + } + $this->ensureFileExists($path); // always verified: the flag only waives the extension check + + return ''; + } +} diff --git a/src/Retrieval/Loader/Reader/Exception/FileNotFoundException.php b/src/Retrieval/Loader/Reader/Exception/FileNotFoundException.php new file mode 100644 index 0000000..b2c2a49 --- /dev/null +++ b/src/Retrieval/Loader/Reader/Exception/FileNotFoundException.php @@ -0,0 +1,28 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Reader\Exception; + +/** + * Class FileNotFoundException. + * + * @author bernard-ng + */ +final class FileNotFoundException extends \RuntimeException +{ + public function __construct(string $file, int $code = 0, ?\Throwable $previous = null) + { + $message = sprintf('Failed to open stream: No such file %s', $file); + parent::__construct($message, $code, $previous); + } +} diff --git a/src/Retrieval/Loader/Reader/Exception/UnreadableFileException.php b/src/Retrieval/Loader/Reader/Exception/UnreadableFileException.php new file mode 100644 index 0000000..4889cd5 --- /dev/null +++ b/src/Retrieval/Loader/Reader/Exception/UnreadableFileException.php @@ -0,0 +1,28 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Reader\Exception; + +/** + * Class UnreadableFileException. 
+ * + * @author bernard-ng + */ +final class UnreadableFileException extends \RuntimeException +{ + public function __construct(string $file, int $code = 0, ?\Throwable $previous = null) + { + $message = sprintf('Unable to read the content of %s', $file); + parent::__construct($message, $code, $previous); + } +} diff --git a/src/Retrieval/Loader/Reader/Exception/UnsupportedFileException.php b/src/Retrieval/Loader/Reader/Exception/UnsupportedFileException.php new file mode 100644 index 0000000..170c5b7 --- /dev/null +++ b/src/Retrieval/Loader/Reader/Exception/UnsupportedFileException.php @@ -0,0 +1,34 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Reader\Exception; + +/** + * Class UnsupportedFileException. + * + * @author bernard-ng + */ +final class UnsupportedFileException extends \RuntimeException +{ + /** + * @param array{0?: string, 1?: string} $extensions Given extension followed by the supported-extensions pattern; missing entries render as "unknown" + */ + public function __construct( + array $extensions = [], + int $code = 0, + ?\Throwable $previous = null, + ) { + $message = vsprintf('The given %s file is not supported, this reader supports %s files', array_pad(array_values($extensions), 2, 'unknown')); + parent::__construct($message, $code, $previous); + } +} diff --git a/src/Retrieval/Loader/Reader/FileReader.php b/src/Retrieval/Loader/Reader/FileReader.php new file mode 100644 index 0000000..5441dc3 --- /dev/null +++ b/src/Retrieval/Loader/Reader/FileReader.php @@ -0,0 +1,52 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Reader; + +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnsupportedFileException; +use Symfony\Component\Filesystem\Path; + +/** + * Class FileReader. 
+ * + * @author bernard-ng + */ +final readonly class FileReader extends AbstractReader +{ + public const string SUPPORTED_EXTENSIONS_PATTERN = '/^(?:txt|pdf)$/'; + + #[\Override] + public function readContent(string $path, bool $skipExtensionCheck = false): string + { + foreach ($this->getSupportedReaders() as $reader) { + if ($reader->supports($path)) { + $this->ensureFileExists($path); + return $reader->readContent($path, skipExtensionCheck: true); + } + } + + $extension = Path::getExtension($path, forceLowerCase: true); + throw new UnsupportedFileException([$extension, self::SUPPORTED_EXTENSIONS_PATTERN]); + } + + /** + * @return array + */ + private function getSupportedReaders(): array + { + return [ + new TxtReader(), + new PdfReader(), + ]; + } +} diff --git a/src/Retrieval/Loader/Reader/PdfReader.php b/src/Retrieval/Loader/Reader/PdfReader.php new file mode 100644 index 0000000..d8e4175 --- /dev/null +++ b/src/Retrieval/Loader/Reader/PdfReader.php @@ -0,0 +1,56 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Reader; + +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnreadableFileException; +use Smalot\PdfParser\{Config, Parser}; + +/** + * Class PdfReader. + * + * @author bernard-ng + */ +final readonly class PdfReader extends AbstractReader +{ + public const string SUPPORTED_EXTENSIONS_PATTERN = '/^pdf$/'; + + private Config $config; + + private Parser $parser; + + public function __construct() + { + $this->config = new Config(); + + // It won't retain image content anymore, but will use less memory too. 
+ $this->config->setRetainImageContent(false); + $this->parser = new Parser(config: $this->config); + } + + #[\Override] + public function readContent(string $path, bool $skipExtensionCheck = false): string + { + if ($skipExtensionCheck === false) { + $this->ensureSupported($path); + } + $this->ensureFileExists($path); // always verified: the flag only waives the extension check + + try { + // Extraction is wrapped together with parsing: getText() may also throw on a malformed document. + return $this->parser->parseFile($path)->getText(); + } catch (\Exception $e) { + throw new UnreadableFileException($path, previous: $e); + } + } +} diff --git a/src/Retrieval/Loader/Reader/TxtReader.php b/src/Retrieval/Loader/Reader/TxtReader.php new file mode 100644 index 0000000..7a50239 --- /dev/null +++ b/src/Retrieval/Loader/Reader/TxtReader.php @@ -0,0 +1,43 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Retrieval\Loader\Reader; + +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnreadableFileException; + +/** + * Class TxtReader. 
+ * + * @author bernard-ng + */ +final readonly class TxtReader extends AbstractReader +{ + public const string SUPPORTED_EXTENSIONS_PATTERN = '/^txt$/'; + + #[\Override] + public function readContent(string $path, bool $skipExtensionCheck = false): string + { + if ($skipExtensionCheck === false) { + $this->ensureSupported($path); + } + $this->ensureFileExists($path); // always verified: the flag only waives the extension check + + $content = file_get_contents($path); + + if ($content === false) { + throw new UnreadableFileException($path); + } + + return $content; + } +} diff --git a/src/Retrieval/Metadata.php b/src/Retrieval/Metadata.php index 1341aae..baa0d1e 100644 --- a/src/Retrieval/Metadata.php +++ b/src/Retrieval/Metadata.php @@ -33,6 +33,7 @@ public function __construct( /** * @throws \JsonException */ + #[\Override] public function __toString(): string { return json_encode([ diff --git a/src/Retrieval/Splitter/SplitterInterface.php b/src/Retrieval/Splitter/SplitterInterface.php index 3f5c12b..55ede0d 100644 --- a/src/Retrieval/Splitter/SplitterInterface.php +++ b/src/Retrieval/Splitter/SplitterInterface.php @@ -30,5 +30,5 @@ public function splitText(string $text): iterable; /** * @return iterable */ - public function createDocuments(string $text): iterable; + public function createDocuments(Document|string $text): iterable; } diff --git a/src/Retrieval/Splitter/TextSplitter.php b/src/Retrieval/Splitter/TextSplitter.php index e91bf5e..10a1561 100644 --- a/src/Retrieval/Splitter/TextSplitter.php +++ b/src/Retrieval/Splitter/TextSplitter.php @@ -72,8 +72,12 @@ public function splitText(string $text): iterable } #[\Override] - public function createDocuments(string $text): iterable + public function createDocuments(Document|string $text): iterable { + if ($text instanceof Document) { + $text = $text->content; + } + /** * @var int $index */ diff --git a/tests/Retrieval/Loader/Reader/FileReaderTest.php b/tests/Retrieval/Loader/Reader/FileReaderTest.php new file mode 100644 
index 0000000..787bd4a --- /dev/null +++ 
b/tests/Retrieval/Loader/Reader/FileReaderTest.php @@ -0,0 +1,56 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Tests\Retrieval\Loader\Reader; + +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\FileNotFoundException; +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnsupportedFileException; +use Devscast\Lugha\Retrieval\Loader\Reader\FileReader; +use PHPUnit\Framework\TestCase; + +/** + * Class FileReaderTest. + * + * @author bernard-ng + */ +final class FileReaderTest extends TestCase +{ + private FileReader $reader; + + protected function setUp(): void + { + $this->reader = new FileReader(); + parent::setUp(); + } + + public function testReadContentWithTheBestReader(): void + { + $text = $this->reader->readContent(__DIR__ . '/../../../fixtures/document.txt'); + $pdf = $this->reader->readContent(__DIR__ . '/../../../fixtures/document.pdf'); + + $this->assertSame('hello world', $text); + $this->assertSame('helloworld', $pdf); + } + + public function testCannotReadUnsupportedFile(): void + { + $this->expectException(UnsupportedFileException::class); + $this->reader->readContent(__DIR__ . '/../../../fixtures/document.docx'); + } + + public function testCannotReadNonExistingFile(): void + { + $this->expectException(FileNotFoundException::class); + $this->reader->readContent(__DIR__ . '/../../../fixtures/does-not-exist.pdf'); + } +} diff --git a/tests/Retrieval/Loader/Reader/PdfReaderTest.php b/tests/Retrieval/Loader/Reader/PdfReaderTest.php new file mode 100644 index 0000000..7e13e36 --- /dev/null +++ b/tests/Retrieval/Loader/Reader/PdfReaderTest.php @@ -0,0 +1,70 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Tests\Retrieval\Loader\Reader; + +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\FileNotFoundException; +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnreadableFileException; +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnsupportedFileException; +use Devscast\Lugha\Retrieval\Loader\Reader\PdfReader; +use PHPUnit\Framework\TestCase; + +/** + * Class PdfReaderTest. + * + * @author bernard-ng + */ +final class PdfReaderTest extends TestCase +{ + private PdfReader $reader; + + protected function setUp(): void + { + $this->reader = new PdfReader(); + parent::setUp(); + } + + public function testReadContent(): void + { + $content = $this->reader->readContent(__DIR__ . '/../../../fixtures/document.pdf'); + $this->assertSame('helloworld', $content); // TODO: try to understand why whitespace is not taken into account + } + + public function testCannotReadNonExistingFile(): void + { + $this->expectException(FileNotFoundException::class); + $this->reader->readContent(__DIR__ . '/../../../fixtures/does-not-exist.pdf'); + } + + public function testCannotReadFileWithoutReadPermission(): void + { + // Unique path under the system temp dir, removed afterwards so reruns start clean. + $path = sys_get_temp_dir() . '/' . uniqid('lugha_', true) . '.pdf'; + touch($path); + chmod($path, 0o000); + + $this->expectException(UnreadableFileException::class); + + try { + $this->reader->readContent($path); + } finally { + unlink($path); + } + } + + public function testReadContentOnUnsupportedFile(): void + { + $this->expectException(UnsupportedFileException::class); + $this->reader->readContent(__DIR__ . 
'/../../../fixtures/document.txt'); + } +} diff --git a/tests/Retrieval/Loader/Reader/TxtReaderTest.php b/tests/Retrieval/Loader/Reader/TxtReaderTest.php new file mode 100644 index 0000000..af1afa4 --- /dev/null +++ b/tests/Retrieval/Loader/Reader/TxtReaderTest.php @@ -0,0 +1,70 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +declare(strict_types=1); + +namespace Devscast\Lugha\Tests\Retrieval\Loader\Reader; + +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\FileNotFoundException; +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnreadableFileException; +use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnsupportedFileException; +use Devscast\Lugha\Retrieval\Loader\Reader\TxtReader; +use PHPUnit\Framework\TestCase; + +/** + * Class TxtReaderTest. + * + * @author bernard-ng + */ +final class TxtReaderTest extends TestCase +{ + private TxtReader $reader; + + protected function setUp(): void + { + $this->reader = new TxtReader(); + parent::setUp(); + } + + public function testReadContent(): void + { + $content = $this->reader->readContent(__DIR__ . '/../../../fixtures/document.txt'); + $this->assertSame('hello world', $content); + } + + public function testCannotReadNonExistingFile(): void + { + $this->expectException(FileNotFoundException::class); + $this->reader->readContent(__DIR__ . '/../../../fixtures/does-not-exist.txt'); + } + + public function testCannotReadFileWithoutReadPermission(): void + { + // Unique path under the system temp dir, removed afterwards so reruns start clean. + $path = sys_get_temp_dir() . '/' . uniqid('lugha_', true) . '.txt'; + touch($path); + chmod($path, 0o000); + + $this->expectException(UnreadableFileException::class); + + try { + $this->reader->readContent($path); + } finally { + unlink($path); + } + } + + public function testReadContentOnUnsupportedFile(): void + { + $this->expectException(UnsupportedFileException::class); + $this->reader->readContent(__DIR__ . 
'/../../../fixtures/document.pdf'); + } +} diff --git a/tests/fixtures/document.docx b/tests/fixtures/document.docx new file mode 100644 index 0000000..0696fb8 Binary files /dev/null and b/tests/fixtures/document.docx differ diff --git a/tests/fixtures/document.pdf b/tests/fixtures/document.pdf new file mode 100644 index 0000000..0908db2 Binary files /dev/null and b/tests/fixtures/document.pdf differ diff --git a/tests/fixtures/document.txt b/tests/fixtures/document.txt new file mode 100644 index 0000000..95d09f2 --- /dev/null +++ b/tests/fixtures/document.txt @@ -0,0 +1 @@ +hello world \ No newline at end of file