From 737e25b7f9549c4308308d95fc90bdf5135f1f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20=C5=BB=C3=B3=C5=82tak?= Date: Tue, 12 Nov 2024 20:49:30 +0100 Subject: [PATCH] /download endpoint implemented The /download endpoint allows a batch download of multiple resources in a zip file. Also, the content-disposition response header for binaries uses the *=UTF-8''{fileName} syntax now to clearly indicate the file name encoding. --- composer.json | 4 +- config-sample.yaml | 6 + src/acdhOeaw/arche/core/Auth.php | 9 +- src/acdhOeaw/arche/core/BinaryPayload.php | 4 +- src/acdhOeaw/arche/core/Download.php | 215 +++++++++++++++++++++ src/acdhOeaw/arche/core/RestController.php | 7 + tests/DownloadTest.php | 212 ++++++++++++++++++++ tests/RestTest.php | 4 +- tests/TestBase.php | 7 +- tests/UserApiTest.php | 4 +- tests/config.yaml | 5 + 11 files changed, 464 insertions(+), 13 deletions(-) create mode 100644 src/acdhOeaw/arche/core/Download.php create mode 100644 tests/DownloadTest.php diff --git a/composer.json b/composer.json index c4945d8..9b14607 100644 --- a/composer.json +++ b/composer.json @@ -20,7 +20,8 @@ "zozlak/http-accept": ">=0.1.0 <1", "zozlak/logging": "^1", "zozlak/rdf-constants": "^1", - "php-amqplib/php-amqplib": "^3.1" + "php-amqplib/php-amqplib": "^3.1", + "maennchen/zipstream-php": "^3.1" }, "autoload": { "psr-4": { @@ -29,6 +30,7 @@ } }, "require-dev": { + "ext-zip": "*", "phpunit/phpunit": "*", "zozlak/yaml-merge": "^1", "phpstan/phpstan": "*" diff --git a/config-sample.yaml b/config-sample.yaml index ec1fa52..8b5aff5 100644 --- a/config-sample.yaml +++ b/config-sample.yaml @@ -214,6 +214,12 @@ rest: withReferences: X-WITH-REFERENCES resourceProperties: X-RESOURCE-PROPERTIES relativesProperties: X-RELATIVES-PROPERTIES +download: + # store or deflate + compressionMethod: store + compressionLevel: ~ + fileName: data.zip + strict: false schema: id: https://vocabs.acdh.oeaw.ac.at/schema#hasIdentifier label: https://vocabs.acdh.oeaw.ac.at/schema#hasTitle 
diff --git a/src/acdhOeaw/arche/core/Auth.php b/src/acdhOeaw/arche/core/Auth.php index 466828a..81e6279 100644 --- a/src/acdhOeaw/arche/core/Auth.php +++ b/src/acdhOeaw/arche/core/Auth.php @@ -104,7 +104,7 @@ public function checkCreateRights(): void { } public function checkAccessRights(int $resId, string $privilege, - bool $metadataRead): void { + bool $metadataRead, bool $deny = true): void { $c = RC::$config->accessControl; if ($metadataRead && !$c->enforceOnMetadata || $this->isAdmin) { return; @@ -115,7 +115,11 @@ public function checkAccessRights(int $resId, string $privilege, $allowed = json_decode($allowed) ?? []; $default = $c->defaultAction->$privilege ?? self::DEFAULT_DENY; if (count(array_intersect($this->userRoles, $allowed)) === 0 && $default !== self::DEFAULT_ALLOW) { - $this->denyAccess($allowed); + if ($deny) { + $this->denyAccess($allowed); + } else { + throw new RepoException('Unauthorized', $this->isPublic() ? 401 : 403); + } } } @@ -226,7 +230,6 @@ public function denyAccess(array $allowed): void { throw new RepoException((string) $resp->getBody(), $resp->getStatusCode(), headers: $headers); } } - RC::$log->alert("FOO! " . implode(',', $cookieHeader)); throw new RepoException('Forbidden', 403, headers: $cookieHeader); } diff --git a/src/acdhOeaw/arche/core/BinaryPayload.php b/src/acdhOeaw/arche/core/BinaryPayload.php index 94ea6d8..f0ea268 100644 --- a/src/acdhOeaw/arche/core/BinaryPayload.php +++ b/src/acdhOeaw/arche/core/BinaryPayload.php @@ -174,7 +174,7 @@ public function getHeaders(): array { throw new NoBinaryException(); } if (!empty($data->filename)) { - $headers['Content-Disposition'] = 'attachment; filename="' . $data->filename . '"'; + $headers['Content-Disposition'] = "attachment; filename*=UTF-8''" . 
rawurlencode($data->filename); } if (!empty($data->mime)) { $headers['Content-Type'] = $data->mime; @@ -278,7 +278,7 @@ private function getRequestMetadataRaw(): array { $fileName = null; if (preg_match('/^attachment; filename=/', $contentDisposition)) { - $fileName = (string) preg_replace('/^attachment; filename="?/', '', $contentDisposition); + $fileName = (string) preg_replace('/^attachment; filename(=|[*]=.*\'.*\')"?/', '', $contentDisposition); $fileName = (string) preg_replace('/"$/', '', $fileName); RC::$log->debug("\t\tfile name: $fileName"); } diff --git a/src/acdhOeaw/arche/core/Download.php b/src/acdhOeaw/arche/core/Download.php new file mode 100644 index 0000000..cfa1bda --- /dev/null +++ b/src/acdhOeaw/arche/core/Download.php @@ -0,0 +1,215 @@ +> + */ + private array $parents; + private PDOStatement $parentQuery; + + /** + * + * @var array + */ + private array $parentQueryParam; + + public function get(): void { + $ids = $_GET['ids'] ?? $_POST['ids'] ?? []; + if (!is_array($ids)) { + $ids = [$ids]; + } + if (count($ids) === 0) { + throw new RepoException('No resources identifiers provided'); + } + + $allIds = $this->collectChildren($ids); + unset($ids); + + $skip = (bool) json_decode($_GET['skipUnauthorized'] ?? 'false'); // so "false" is turned into false + $validIds = $this->checkAccessRights($allIds, $skip); + unset($allIds); + if (count($validIds) === 0) { + throw new RepoException("Unauthorized to download all requested resources", 403); + } + + // create a zip + $cfg = RC::$config->download; + $strict = filter_var($_GET['strict'] ?? $cfg->strict ?? self::DEFAULT_STRICT, FILTER_VALIDATE_BOOLEAN); // a plain cast would treat the string "false" as true + $method = match ($cfg->compressionMethod ?? '') { + 'store' => CompressionMethod::STORE, + 'deflate' => CompressionMethod::DEFLATE, + default => self::DEFAULT_COMPRESSION_METHOD, + }; + $level = $cfg->compressionLevel ?? self::DEFAULT_COMPRESSION_LEVEL; + $fileName = $cfg->fileName ?? 
self::DEFAULT_FILE_NAME; + + $metaQuery = RC::$pdo->prepare(" + SELECT m1.value AS filename, m2.value AS lastmod, m3.value_n AS filesize, r.target_id AS parent + FROM + metadata m1 + JOIN metadata m2 USING (id) + JOIN metadata m3 USING (id) + LEFT JOIN relations r ON m1.id = r.id AND r.property = ? + WHERE + m1.id = ? + AND m1.property = ? + AND m2.property = ? + AND m3.property = ? + "); + $metaQueryParam = [ + RC::$schema->parent, + null, + RC::$schema->fileName, + RC::$schema->binaryModificationDate, + RC::$schema->binarySize, + ]; + $this->parents = []; + unset($this->parentQuery); + unset($this->parentQueryParam); + $zip = new ZipStream(defaultCompressionMethod: $method, defaultDeflateLevel: $level, enableZip64: !$strict, defaultEnableZeroHeader: !$strict, outputName: $fileName); + foreach ($validIds as $id) { + $binary = new BinaryPayload($id); + $path = $binary->getPath(); + if (!file_exists($path)) { + continue; // metadata-only resource + } + $metaQueryParam[1] = $id; + $metaQuery->execute($metaQueryParam); + $meta = $metaQuery->fetchObject(); + $this->fetchParentsMeta($id, $meta); + $filename = $meta->filename; + $pid = (string) $meta->parent; + while (!empty($pid)) { + $filename = $this->parents[$pid]['filename'] . '/' . $filename; + $pid = $this->parents[$pid]['parent']; + } + $zip->addFileFromPath($filename, $path, lastModificationDateTime: new DateTimeImmutable($meta->lastmod), exactSize: $meta->filesize); + } + $zip->finish(); + } + + public function options(int $code = 204): void { + http_response_code($code); + header('Allow: OPTIONS, HEAD, GET, POST'); + } + + private function fetchParentsMeta(int $id, object $meta): void { + $this->parentQuery ??= RC::$pdo->prepare(" + SELECT r.id, n, COALESCE(m1.value, m2.value) AS filename + FROM + get_relatives(?, ?, 0, -999999, false, false) r + LEFT JOIN metadata m1 ON r.id = m1.id AND m1.property = ? + LEFT JOIN metadata m2 ON r.id = m2.id AND m2.property = ? 
+ ORDER BY n DESC + "); + $this->parentQueryParam ??= [ + null, + RC::$schema->parent, + RC::$schema->fileName, + RC::$schema->label, + ]; + if ($meta->parent !== null && !isset($this->parents[$meta->parent])) { + $this->parentQueryParam[0] = $id; + $this->parentQuery->execute($this->parentQueryParam); + $parentsMeta = $this->parentQuery->fetchAll(PDO::FETCH_OBJ); + for ($i = 0; $i < count($parentsMeta); $i++) { + $pid = (string) $parentsMeta[$i]->id; + if (isset($this->parents[$pid])) { // already cached by an earlier call + break; + } + $this->parents[$pid] = [ + 'filename' => preg_replace(self::FORBIDDEN_FILENAME_CHARS_REGEX, self::FORBIDDEN_FILENAME_CHARS_REPLACE, $parentsMeta[$i]->filename), + 'parent' => (string) ($parentsMeta[$i + 1] ?? null)?->id, + ]; + } + } + } + + /** + * + * @param array $ids + * @return array + */ + private function collectChildren(array $ids): array { + $query = RC::$pdo->prepare("SELECT id FROM get_relatives(?, ?, 999999, 0, false, false)"); + $param = [null, RC::$schema->parent]; + foreach ($ids as $id) { + $param[0] = $id; + $query->execute($param); + while ($i = $query->fetchColumn()) { + $allIds[(string) $i] = ''; + } + } + return $allIds ?? []; // $allIds stays unset when no query returned a row + } + + /** + * + * @param array $ids + * @return array + */ + private function checkAccessRights(array $ids, bool $skipUnauthorized): array { + $validIds = []; + foreach (array_keys($ids) as $id) { + try { + RC::$log->debug("Testing $id"); + $id = (int) $id; + RC::$auth->checkAccessRights($id, 'read', false); + RC::$log->debug(" passed"); + $validIds[] = $id; + } catch (RepoException $e) { + if (!$skipUnauthorized || !in_array($e->getCode(), [401, 403])) { + throw $e; + } + } + } + return $validIds; + } +} diff --git a/src/acdhOeaw/arche/core/RestController.php b/src/acdhOeaw/arche/core/RestController.php index 3efe758..590bfd1 100644 --- a/src/acdhOeaw/arche/core/RestController.php +++ b/src/acdhOeaw/arche/core/RestController.php @@ -208,6 +208,13 @@ static public function handleRequest(): void { } else { 
$search->options(405); } + } elseif ($path === 'download') { + $dwnld = new Download(); + if ($method === 'Get' || $method === 'Post') { + $dwnld->get(); + } else { + $dwnld->options($method === 'Options' ? 204 : 405); + } } elseif (preg_match('>^([0-9]+/?)?(metadata|tombstone)?$>', $path)) { $collection = $suffix = ''; $id = null; diff --git a/tests/DownloadTest.php b/tests/DownloadTest.php new file mode 100644 index 0000000..efeb1ac --- /dev/null +++ b/tests/DownloadTest.php @@ -0,0 +1,212 @@ +createBinaryResource(); + $dwnldUri = self::$baseUrl . 'download?ids=' . preg_replace('`^.*/`', '', $uri); + $req = new Request('get', $dwnldUri, ['eppn' => 'admin']); + $resp = self::$client->send($req); + $content = $this->testZipBasics($resp, 1); + $refContent = [basename(self::BINARY_RES_PATH) => file_get_contents(self::BINARY_RES_PATH)]; + $this->assertEquals($refContent, $content); + } + + public function testCollection(): void { + // create resource structure + $serializer = new NQuadsSerializer(); + $txId = $this->beginTransaction(); + $headers = [ + 'eppn' => 'admin', + 'content-type' => 'application/n-triples', + self::$config->rest->headers->transactionId => $txId, + ]; + $collections = []; + $binaries = []; + for ($i = 0; $i < 3; $i++) { + $sbj = DF::namedNode(self::$baseUrl . '/metadata'); + $meta = new Dataset(); + $meta->add(DF::quad($sbj, self::$schema->label, DF::literal("collection $i", "en"))); + if ($i > 0) { + $meta->add(DF::quad($sbj, self::$schema->parent, $collections[$i - 1])); + } + $collections[$i] = DF::namedNode($this->createMetadataResource($meta, $txId)); + for ($j = 0; $j < 2; $j++) { + $uri = $this->createBinaryResource($txId, $j === 0 ? 
self::BINARY_RES_PATH : __FILE__); + $binaries[$i][] = $uri; + $sbj = DF::namedNode($uri); + $meta = new Dataset(); + $meta->add(DF::quad($sbj, self::$schema->parent, $collections[$i])); + $req = new Request('patch', "$uri/metadata", $headers, $serializer->serialize($meta)); + $resp = self::$client->send($req); + $this->assertEquals(200, $resp->getStatusCode()); + } + } + $this->commitTransaction($txId); + + // two arbitrary binaries using GET + $ids = [$binaries[2][1], $binaries[0][0]]; + $ids = array_map(fn($x) => preg_replace('`^.*/`', '', $x), $ids); + $uri = self::$baseUrl . 'download?' . http_build_query(['ids' => $ids]); + $req = new Request('get', $uri, ['eppn' => 'admin']); + $resp = self::$client->send($req); + $content = $this->testZipBasics($resp, 2); + $refContent = [ + 'collection 0/test.ttl' => file_get_contents(self::BINARY_RES_PATH), + 'collection 0/collection 1/collection 2/DownloadTest.php' => file_get_contents(__FILE__), + ]; + $this->assertEquals($refContent, $content); + + // lowest collection using GET + $ids = [(string) $collections[2]]; + $ids = array_map(fn($x) => preg_replace('`^.*/`', '', $x), $ids); + $uri = self::$baseUrl . 'download?' . http_build_query(['ids' => $ids]); + $req = new Request('get', $uri, ['eppn' => 'admin']); + $resp = self::$client->send($req); + $content = $this->testZipBasics($resp, 2); + $this->assertEquals([ + 'collection 0/collection 1/collection 2/test.ttl' => file_get_contents(self::BINARY_RES_PATH), + 'collection 0/collection 1/collection 2/DownloadTest.php' => file_get_contents(__FILE__), + ], $content); + + // middle collection with arbitrary files using POST + $ids = [(string) $collections[1], $binaries[0][1], $binaries[2][0]]; + $ids = array_map(fn($x) => preg_replace('`^.*/`', '', $x), $ids); + $uri = self::$baseUrl . 
'download'; + $headers = ['eppn' => 'admin', 'content-type' => 'application/x-www-form-urlencoded']; + $req = new Request('post', $uri, $headers, http_build_query(['ids' => $ids])); + $resp = self::$client->send($req); + $content = $this->testZipBasics($resp, 5); + $this->assertEquals([ + 'collection 0/DownloadTest.php' => file_get_contents(__FILE__), + 'collection 0/collection 1/test.ttl' => file_get_contents(self::BINARY_RES_PATH), + 'collection 0/collection 1/DownloadTest.php' => file_get_contents(__FILE__), + 'collection 0/collection 1/collection 2/test.ttl' => file_get_contents(self::BINARY_RES_PATH), + 'collection 0/collection 1/collection 2/DownloadTest.php' => file_get_contents(__FILE__), + ], $content); + } + + public function testAuth(): void { + // create resource structure + $username = 'ordinaryUser'; + $serializer = new NQuadsSerializer(); + $txId = $this->beginTransaction(); + $headers = [ + 'eppn' => 'admin', + 'content-type' => 'application/n-triples', + self::$config->rest->headers->transactionId => $txId, + ]; + $binaries = []; + $sbj = DF::namedNode(self::$baseUrl . '/metadata'); + $meta = new Dataset(); + $meta->add(DF::quad($sbj, self::$schema->label, DF::literal("collection_1->: ąę", "en"))); + $collection = DF::namedNode($this->createMetadataResource($meta, $txId)); + for ($j = 0; $j < 2; $j++) { + $uri = $this->createBinaryResource($txId, $j === 0 ? 
self::BINARY_RES_PATH : __FILE__); + $binaries[] = $uri; + $sbj = DF::namedNode($uri); + $meta = new Dataset(); + $meta->add(DF::quad($sbj, self::$schema->parent, $collection)); + if ($j === 0) { + $meta->add(DF::quad($sbj, DF::namedNode(self::$config->accessControl->schema->read), DF::literal($username))); + } + $req = new Request('patch', "$uri/metadata", $headers, $serializer->serialize($meta)); + $resp = self::$client->send($req); + $this->assertEquals(200, $resp->getStatusCode()); + } + $this->commitTransaction($txId); + + $ids = [(string) $collection]; + $ids = array_map(fn($x) => preg_replace('`^.*/`', '', $x), $ids); + + // without skipping unauthorized + $uri = self::$baseUrl . 'download?' . http_build_query(['ids' => $ids]); + $req = new Request('get', $uri); + $resp = self::$client->send($req); + $this->assertEquals(403, $resp->getStatusCode()); + $req = new Request('get', $uri, ['eppn' => $username]); + $resp = self::$client->send($req); + $this->assertEquals(403, $resp->getStatusCode()); + + // with skipping unauthorized + $param = ['ids' => $ids, 'skipUnauthorized' => true]; + $uri = self::$baseUrl . 'download?' . 
http_build_query($param); + $req = new Request('get', $uri); + $resp = self::$client->send($req); + $this->assertEquals(403, $resp->getStatusCode()); + $this->assertEquals("Unauthorized to download all requested resources", (string) $resp->getBody()); + $req = new Request('get', $uri, ['eppn' => $username]); + $resp = self::$client->send($req); + $content = $this->testZipBasics($resp, 1); + $refContent = ['collection_1-__ ąę/test.ttl' => file_get_contents(self::BINARY_RES_PATH)]; + $this->assertEquals($refContent, $content); + } + + /** + * + * @return array + */ + private function testZipBasics(Response $resp, int $expectedCount): array { + $this->assertEquals(200, $resp->getStatusCode()); + $this->assertEquals(["attachment; filename*=UTF-8''data.zip"], $resp->getHeader('Content-Disposition')); + $this->assertEquals(['application/x-zip'], $resp->getHeader('Content-Type')); + file_put_contents(self::TMP_ZIP, (string) $resp->getBody()); + $zip = new ZipArchive(); + $this->assertTrue($zip->open(self::TMP_ZIP)); + $this->assertEquals($expectedCount, $zip->count()); + $content = []; + for ($i = 0; $i < $expectedCount; $i++) { + $content[$zip->getNameIndex($i)] = $zip->getFromIndex($i); + } + ksort($content); + return $content; + } +} diff --git a/tests/RestTest.php b/tests/RestTest.php index 03506ab..740f823 100644 --- a/tests/RestTest.php +++ b/tests/RestTest.php @@ -283,7 +283,7 @@ public function testHead(): void { $req = new Request('head', $location, $this->getHeaders()); $resp = self::$client->send($req); $this->assertEquals(200, $resp->getStatusCode()); - $this->assertEquals('attachment; filename="test.ttl"', $resp->getHeader('Content-Disposition')[0] ?? ''); + $this->assertEquals("attachment; filename*=UTF-8''test.ttl", $resp->getHeader('Content-Disposition')[0] ?? ''); $this->assertEquals('text/turtle;charset=UTF-8', $resp->getHeader('Content-Type')[0] ?? 
''); // In HTTP/1.1 and newer server may respond with transfer-encoding: chuncked which does not contain the content-length header if (count($resp->getHeader('Content-Length')) > 0) { @@ -1225,7 +1225,7 @@ public function testSkipContentDisposition(): void { $resp = self::$client->send($req); $this->assertEquals(200, $resp->getStatusCode()); $this->assertEquals(file_get_contents(self::BINARY_RES_PATH), (string) $resp->getBody()); - $refHeader = ['attachment; filename="' . basename(self::BINARY_RES_PATH) . '"']; + $refHeader = ["attachment; filename*=UTF-8''" . basename(self::BINARY_RES_PATH)]; $this->assertEquals($refHeader, $resp->getHeader('Content-Disposition')); $req = new Request('get', $location . '?skipContentDisposition=', $this->getHeaders()); diff --git a/tests/TestBase.php b/tests/TestBase.php index 53dd045..c01a5ff 100644 --- a/tests/TestBase.php +++ b/tests/TestBase.php @@ -175,7 +175,8 @@ protected function createMetadata(?string $uri = null): DatasetNode { return $g; } - protected function createBinaryResource(?int $txId = null): string { + protected function createBinaryResource(?int $txId = null, + string $path = self::BINARY_RES_PATH): string { $extTx = $txId !== null; if (!$extTx) { $txId = $this->beginTransaction(); @@ -183,11 +184,11 @@ protected function createBinaryResource(?int $txId = null): string { $headers = [ self::$config->rest->headers->transactionId => $txId, - 'Content-Disposition' => 'attachment; filename="test.ttl"', + 'Content-Disposition' => 'attachment; filename="' . basename($path) . 
'"', 'Content-Type' => 'text/turtle', 'Eppn' => 'admin', ]; - $body = (string) file_get_contents(self::BINARY_RES_PATH); + $body = (string) file_get_contents($path); $req = new Request('post', self::$baseUrl, $headers, $body); $resp = self::$client->send($req); diff --git a/tests/UserApiTest.php b/tests/UserApiTest.php index f28bdf7..e97efef 100644 --- a/tests/UserApiTest.php +++ b/tests/UserApiTest.php @@ -273,14 +273,14 @@ public function testUserLogout(): void { $req = new Request('get', self::$baseUrl . 'user/logout?redirect=' . rawurldecode('/foo/bar'), $headers); $resp = self::$client->send($req); $this->assertEquals(401, $resp->getStatusCode()); - $this->assertEquals(['0: url=/foo/bar'], $resp->getHeader('Refresh')); + $this->assertEquals(['0; url=/foo/bar'], $resp->getHeader('Refresh')); // logout with invalid credentials and redirect $req = new Request('get', self::$baseUrl . 'user/logout?redirect=/foo', [ 'Authorization' => 'Basic ' . base64_encode('x:y')]); $resp = self::$client->send($req); $this->assertEquals(401, $resp->getStatusCode()); - $this->assertEquals(['0: url=/foo'], $resp->getHeader('Refresh')); + $this->assertEquals(['0; url=/foo'], $resp->getHeader('Refresh')); } diff --git a/tests/config.yaml b/tests/config.yaml index 3979ae3..418ace3 100644 --- a/tests/config.yaml +++ b/tests/config.yaml @@ -162,6 +162,11 @@ rest: resourceProperties: X-RESOURCE-PROPERTIES relativesProperties: X-RELATIVES-PROPERTIES cors: __origin__ +download: + compressionMethod: store + compressionLevel: ~ + fileName: data.zip + strict: false schema: id: https://id label: https://label