diff --git a/doc/release-notes/7084-crawlable-file-access.md b/doc/release-notes/7084-crawlable-file-access.md new file mode 100644 index 00000000000..0e721728e28 --- /dev/null +++ b/doc/release-notes/7084-crawlable-file-access.md @@ -0,0 +1,29 @@ + ## Release Highlights +### A new file access API +A new API offers a *crawlable* view of the folders and files within a dataset: +``` + /api/datasets/{id}/dirindex/ +``` +will output a simple HTML listing, based on the standard Apache +directory index, with Access API download links for individual files, +and recursive calls to the API above for sub-folders. (See the +documentation entry in the guides for more information). +Using this API, ``wget --recursive`` (or a similar crawling client) can +be used to download all the files in a dataset, preserving the file +names and folder structure; without having to use the download-as-zip +API. In addition to being faster (zipping is a relatively +resource-intensive operation on the server side), this process can be +restarted if interrupted (with ``wget --continue`` or equivalent) - +unlike zipped multi-file downloads that always have to start from the +beginning. +On a system that uses S3 with download redirects, the individual file +downloads will be handled by S3 directly, without having to be proxied +through the Dataverse application. 
+ + diff --git a/doc/sphinx-guides/source/api/img/dataset_page_files_view.png b/doc/sphinx-guides/source/api/img/dataset_page_files_view.png new file mode 100644 index 00000000000..dbf20afa93e Binary files /dev/null and b/doc/sphinx-guides/source/api/img/dataset_page_files_view.png differ diff --git a/doc/sphinx-guides/source/api/img/dataset_page_tree_view.png b/doc/sphinx-guides/source/api/img/dataset_page_tree_view.png new file mode 100644 index 00000000000..3c505bc1598 Binary files /dev/null and b/doc/sphinx-guides/source/api/img/dataset_page_tree_view.png differ diff --git a/doc/sphinx-guides/source/api/img/index_view_subfolder.png b/doc/sphinx-guides/source/api/img/index_view_subfolder.png new file mode 100644 index 00000000000..6c940ad2354 Binary files /dev/null and b/doc/sphinx-guides/source/api/img/index_view_subfolder.png differ diff --git a/doc/sphinx-guides/source/api/img/index_view_top.png b/doc/sphinx-guides/source/api/img/index_view_top.png new file mode 100644 index 00000000000..da87a0cd3eb Binary files /dev/null and b/doc/sphinx-guides/source/api/img/index_view_top.png differ diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index 44d099a1685..c02733629f4 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -783,9 +783,92 @@ List Files in a Dataset The fully expanded example above (without environment variables) looks like this: .. code-block:: bash - + curl https://demo.dataverse.org/api/datasets/24/versions/1.0/files +View Dataset Files and Folders as a Directory Index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +|CORS| Provides a *crawlable* view of files and folders within the given dataset and version: + +.. 
code-block:: bash + + curl $SERVER_URL/api/datasets/$ID/dirindex/ + +Optional parameters: + +* ``folder`` - A subfolder within the dataset (default: top-level view of the dataset) +* ``version`` - Specifies the version (default: latest published version) +* ``original=true`` - Download original versions of ingested tabular files. + +This API outputs a simple html listing, based on the standard Apache +directory index, with Access API download links for individual files, +and recursive calls to the API above for sub-folders. + +Using this API, ``wget --recursive`` (or a similar crawling client) can +be used to download all the files in a dataset, preserving the file +names and folder structure; without having to use the download-as-zip +API. In addition to being faster (zipping is a relatively +resource-intensive operation on the server side), this process can be +restarted if interrupted (with ``wget --continue`` or equivalent) - +unlike zipped multi-file downloads that always have to start from the +beginning. + +On a system that uses S3 with download redirects, the individual file +downloads will be handled by S3 directly, without having to be proxied +through the Dataverse application. + +For example, if you have a dataset version with 2 files, one with the folder named "subfolder": + +.. |image1| image:: ./img/dataset_page_files_view.png + +or, as viewed as a tree on the dataset page: + +.. |image2| image:: ./img/dataset_page_tree_view.png + +The output of the API for the top-level folder (``/api/datasets/{dataset}/dirindex/``) will be as follows: + +.. |image3| image:: ./img/index_view_top.png + +with the underlying html source: + +.. code-block:: html + + + Index of folder / +

Index of folder / in dataset doi:XXX/YY/ZZZZ (v. MM)

+ + + + + +
NameLast ModifiedSizeDescription

subfolder/ - -  
testfile.txt13-January-2021 22:3519 B 
+ +The ``/dirindex/?folder=subfolder`` link above will produce the following view: + +.. |image4| image:: ./img/index_view_subfolder.png + +with the html source as follows: + +.. code-block:: html + + + Index of folder /subfolder +

Index of folder /subfolder in dataset doi:XXX/YY/ZZZZ (v. MM)

+ + + + +
NameLast ModifiedSizeDescription

50by1000.tab11-January-2021 09:31102.5 KB 
+ +An example of a ``wget`` command line for crawling ("recursive downloading") of the files and folders in a dataset: + +.. code-block:: bash + + wget -r -e robots=off -nH --cut-dirs=3 --content-disposition https://demo.dataverse.org/api/datasets/24/dirindex/ + +.. note:: In addition to the files and folders in the dataset, the command line above will also save the directory index of each folder, in a separate folder "dirindex". + List All Metadata Blocks for a Dataset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index dba9efac5aa..067cba23d04 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -214,6 +214,33 @@ public List getFileMetadatasSortedByLabelAndFolder() { Collections.sort(fileMetadatasCopy, FileMetadata.compareByLabelAndFolder); return fileMetadatasCopy; } + + public List getFileMetadatasFolderListing(String folderName) { + ArrayList fileMetadatasCopy = new ArrayList<>(); + HashSet subFolders = new HashSet<>(); + + for (FileMetadata fileMetadata : fileMetadatas) { + String thisFolder = fileMetadata.getDirectoryLabel() == null ? "" : fileMetadata.getDirectoryLabel(); + + if (folderName.equals(thisFolder)) { + fileMetadatasCopy.add(fileMetadata); + } else if (thisFolder.startsWith(folderName)) { + String subFolder = "".equals(folderName) ? 
thisFolder : thisFolder.substring(folderName.length() + 1); + if (subFolder.indexOf('/') > 0) { + subFolder = subFolder.substring(0, subFolder.indexOf('/')); + } + + if (!subFolders.contains(subFolder)) { + fileMetadatasCopy.add(fileMetadata); + subFolders.add(subFolder); + } + + } + } + Collections.sort(fileMetadatasCopy, FileMetadata.compareByFullPath); + + return fileMetadatasCopy; + } public void setFileMetadatas(List fileMetadatas) { this.fileMetadatas = fileMetadatas; diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java index 99730a3a024..7b0fb0fd76c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java @@ -575,6 +575,16 @@ public int compare(FileMetadata o1, FileMetadata o2) { } }; + public static final Comparator compareByFullPath = new Comparator() { + @Override + public int compare(FileMetadata o1, FileMetadata o2) { + String folder1 = StringUtil.isEmpty(o1.getDirectoryLabel()) ? "" : o1.getDirectoryLabel().toUpperCase() + "/"; + String folder2 = StringUtil.isEmpty(o2.getDirectoryLabel()) ? 
"" : o2.getDirectoryLabel().toUpperCase() + "/"; + + return folder1.concat(o1.getLabel().toUpperCase()).compareTo(folder2.concat(o2.getLabel().toUpperCase())); + } + }; + public String toPrettyJSON(){ diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Access.java b/src/main/java/edu/harvard/iq/dataverse/api/Access.java index e98bc8aa1fa..281f0a76ce1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Access.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Access.java @@ -270,11 +270,21 @@ private DataFile findDataFileOrDieWrapper(String fileId){ } - @Path("datafile/{fileId}") + @Path("datafile/{fileId:.+}") @GET @Produces({"application/xml"}) public DownloadInstance datafile(@PathParam("fileId") String fileId, @QueryParam("gbrecs") boolean gbrecs, @QueryParam("key") String apiToken, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ { - + + if (fileId.indexOf('/') > -1) { + // This is for embedding folder names into the Access API URLs; + // something like /api/access/datafile/folder/subfolder/1234 + // instead of the normal /api/access/datafile/1234 notation. + // this is supported only for recreating folders during recursive downloads - + // i.e. they are embedded into the URL for the remote client like wget, + // but can be safely ignored here. 
+ fileId = fileId.substring(fileId.lastIndexOf('/') + 1); + } + DataFile df = findDataFileOrDieWrapper(fileId); GuestbookResponse gbr = null; diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 655cdafe04c..4ea4144fbf9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -156,6 +156,7 @@ import org.glassfish.jersey.media.multipart.FormDataParam; import com.amazonaws.services.s3.model.PartETag; +import edu.harvard.iq.dataverse.FileMetadata; import java.util.Map.Entry; @Path("datasets") @@ -466,6 +467,42 @@ public Response getVersionFiles( @PathParam("id") String datasetId, @PathParam(" getDatasetVersionOrDie(req, versionId, findDatasetOrDie(datasetId), uriInfo, headers).getFileMetadatas()))); } + @GET + @Path("{id}/dirindex") + @Produces("text/html") + public Response getFileAccessFolderView(@PathParam("id") String datasetId, @QueryParam("version") String versionId, @QueryParam("folder") String folderName, @QueryParam("original") Boolean originals, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) { + + folderName = folderName == null ? "" : folderName; + versionId = versionId == null ? ":latest-published" : versionId; + + DatasetVersion version; + try { + DataverseRequest req = createDataverseRequest(findUserOrDie()); + version = getDatasetVersionOrDie(req, versionId, findDatasetOrDie(datasetId), uriInfo, headers); + } catch (WrappedResponse wr) { + return wr.getResponse(); + } + + String output = FileUtil.formatFolderListingHtml(folderName, version, "", originals != null && originals); + + // return "NOT FOUND" if there is no such folder in the dataset version: + + if ("".equals(output)) { + return notFound("Folder " + folderName + " does not exist"); + } + + + String indexFileName = folderName.equals("") ? 
".index.html" + : ".index-" + folderName.replace('/', '_') + ".html"; + response.setHeader("Content-disposition", "attachment; filename=\"" + indexFileName + "\""); + + + return Response.ok() + .entity(output) + //.type("application/html"). + .build(); + } + @GET @Path("{id}/versions/{versionId}/metadata") public Response getVersionMetadata( @PathParam("id") String datasetId, @PathParam("versionId") String versionId, @Context UriInfo uriInfo, @Context HttpHeaders headers) { diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 17d997ed25e..da7eece7826 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -38,6 +38,16 @@ import edu.harvard.iq.dataverse.ingest.IngestServiceBean; import edu.harvard.iq.dataverse.ingest.IngestServiceShapefileHelper; import edu.harvard.iq.dataverse.ingest.IngestableDataChecker; +import edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil; +import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatDoc; +import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.HTML_H1; +import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.HTML_TABLE_HDR; +import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatTitle; +import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatTable; +import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatTableCell; +import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatLink; +import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatTableCellAlignRight; +import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatTableRow; import java.awt.image.BufferedImage; import java.io.BufferedInputStream; import java.io.File; @@ -211,6 +221,8 @@ public class FileUtil implements java.io.Serializable { 
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_UNKNOWN, FILE_THUMBNAIL_CLASS_OTHER); FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_ARCHIVE, FILE_THUMBNAIL_CLASS_PACKAGE); } + + private static final String FILE_LIST_DATE_FORMAT = "d-MMMM-yyyy HH:mm"; /** * This string can be prepended to a Base64-encoded representation of a PNG @@ -1928,5 +1940,125 @@ public static boolean isFileAlreadyUploaded(DataFile dataFile, Map checksumMapNe checksumMapNew.put(chksum, dataFile); return false; } + + public static String formatFolderListingHtml(String folderName, DatasetVersion version, String apiLocation, boolean originals) { + String title = formatTitle("Index of folder /" + folderName); + List fileMetadatas = version.getFileMetadatasFolderListing(folderName); + + if (fileMetadatas == null || fileMetadatas.isEmpty()) { + return ""; + } + + String persistentId = version.getDataset().getGlobalId().asString(); + + StringBuilder sb = new StringBuilder(); + + String versionTag = version.getFriendlyVersionNumber(); + versionTag = "DRAFT".equals(versionTag) ? "Draft Version" : "v. " + versionTag; + sb.append(HtmlFormatUtil.formatTag("Index of folder /" + folderName + + " in dataset " + persistentId + + " (" + versionTag + ")", HTML_H1)); + sb.append("\n"); + sb.append(formatFolderListingTableHtml(folderName, fileMetadatas, apiLocation, originals)); + + String body = sb.toString(); + + return formatDoc(title, body); + } + + private static String formatFolderListingTableHtml(String folderName, List fileMetadatas, String apiLocation, boolean originals) { + StringBuilder sb = new StringBuilder(); + + sb.append(formatFolderListingTableHeaderHtml()); + + for (FileMetadata fileMetadata : fileMetadatas) { + String localFolder = fileMetadata.getDirectoryLabel() == null ? 
"" : fileMetadata.getDirectoryLabel(); + + if (folderName.equals(localFolder)) { + String accessUrl = getFileAccessUrl(fileMetadata, apiLocation, originals); + sb.append(formatFileListEntryHtml(fileMetadata, accessUrl)); + sb.append("\n"); + + } else if (localFolder.startsWith(folderName)){ + String subFolder = "".equals(folderName) ? localFolder : localFolder.substring(folderName.length() + 1); + if (subFolder.indexOf('/') > 0) { + subFolder = subFolder.substring(0, subFolder.indexOf('/')); + } + String folderAccessUrl = getFolderAccessUrl(fileMetadata.getDatasetVersion(), folderName, subFolder, apiLocation, originals); + sb.append(formatFileListFolderHtml(subFolder, folderAccessUrl)); + sb.append("\n"); + } + } + + return formatTable(sb.toString()); + } + + private static String formatFolderListingTableHeaderHtml() { + + StringBuilder sb = new StringBuilder(); + sb.append(HtmlFormatUtil.formatTag("Name", HTML_TABLE_HDR)); + sb.append(HtmlFormatUtil.formatTag("Last Modified", HTML_TABLE_HDR)); + sb.append(HtmlFormatUtil.formatTag("Size", HTML_TABLE_HDR)); + sb.append(HtmlFormatUtil.formatTag("Description", HTML_TABLE_HDR)); + + String hdr = formatTableRow(sb.toString()); + + // add a separator row (again, we want it to look just like Apache index) + return hdr.concat(formatTableRow(HtmlFormatUtil.formatTag("
", HTML_TABLE_HDR,"colspan=\"4\""))); + + } + + private static String formatFileListEntryHtml(FileMetadata fileMetadata, String accessUrl) { + StringBuilder sb = new StringBuilder(); + + String fileName = fileMetadata.getLabel(); + String dateString = new SimpleDateFormat(FILE_LIST_DATE_FORMAT).format(fileMetadata.getDataFile().getCreateDate()); + String sizeString = fileMetadata.getDataFile().getFriendlySize(); + + sb.append(formatTableCell(formatLink(fileName, accessUrl))); + sb.append(formatTableCellAlignRight(dateString)); + sb.append(formatTableCellAlignRight(sizeString)); + sb.append(formatTableCellAlignRight(" ")); + + return formatTableRow(sb.toString()); + } + + private static String formatFileListFolderHtml(String folderName, String listApiUrl) { + + StringBuilder sb = new StringBuilder(); + + sb.append(formatTableCell(formatLink(folderName+"/", listApiUrl))); + sb.append(formatTableCellAlignRight(" - ")); + sb.append(formatTableCellAlignRight(" - ")); + sb.append(formatTableCellAlignRight(" ")); + + return formatTableRow(sb.toString()); + } + + private static String getFileAccessUrl(FileMetadata fileMetadata, String apiLocation, boolean original) { + String fileId = fileMetadata.getDataFile().getId().toString(); + + if (StringUtil.nonEmpty(fileMetadata.getDirectoryLabel())) { + fileId = fileMetadata.getDirectoryLabel().concat("/").concat(fileId); + } + + String formatArg = fileMetadata.getDataFile().isTabularData() && original ? 
"?format=original" : ""; + + return apiLocation + "/api/access/datafile/" + fileId + formatArg; + } + + private static String getFolderAccessUrl(DatasetVersion version, String currentFolder, String subFolder, String apiLocation, boolean originals) { + String datasetId = version.getDataset().getId().toString(); + String versionTag = version.getFriendlyVersionNumber(); + versionTag = versionTag.replace("DRAFT", ":draft"); + if (!"".equals(currentFolder)) { + subFolder = currentFolder + "/" + subFolder; + } + + return apiLocation + "/api/datasets/" + datasetId + + "/dirindex/?version=" + versionTag + "&" + + "folder=" + subFolder + + (originals ? "&original=true" : ""); + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/xml/html/HtmlFormatUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/xml/html/HtmlFormatUtil.java new file mode 100644 index 00000000000..432cf68db53 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/util/xml/html/HtmlFormatUtil.java @@ -0,0 +1,133 @@ +/* + Copyright (C) 2005-2012, by the President and Fellows of Harvard College. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Dataverse Network - A web application to share, preserve and analyze research data. + Developed at the Institute for Quantitative Social Science, Harvard University. + Version 3.0. +*/ +package edu.harvard.iq.dataverse.util.xml.html; + +/** + * A simple utility that generates formatted HTML + * to avoid hard-coding html tags by hand. 
+ * + * @author leonid@hmdc.harvard.edu + */ +public class HtmlFormatUtil implements java.io.Serializable { + public static final String HTML_DOCTYPE_HEADER = ""; + public static final String HTML_TAG = "html"; + public static final String HTML_HEAD = "head"; + public static final String HTML_BODY = "body"; + public static final String HTML_TITLE = "title"; + public static final String HTML_TABLE = "table"; + public static final String HTML_TABLE_ROW = "tr"; + public static final String HTML_TABLE_HDR = "th"; + public static final String HTML_TABLE_CELL = "td"; + public static final String HTML_LINK = "a"; + public static final String HTML_HREF = "href"; + public static final String HTML_H1 = "h1"; + + public static final String HTML_ALIGN_TOP = "valign=\"top\""; + public static final String HTML_ALIGN_RIGHT = "align=\"right\""; + + public static String formatTable (String tableBody) { + return formatTag(tableBody, HTML_TABLE); + } + + public static String formatTableRow (String entry) { + return formatTag(entry, HTML_TABLE_ROW); + } + + public static String formatTableCell (String entry) { + return formatTag(entry, HTML_TABLE_CELL, null); + } + + public static String formatTableCell (String entry, String attr) { + return formatTag(entry, HTML_TABLE_CELL, attr); + } + + public static String formatTableCellValignTop(String entry) { + return formatTableCell (entry, HTML_ALIGN_TOP); + } + + public static String formatTableCellAlignRight(String entry) { + return formatTableCell (entry, HTML_ALIGN_RIGHT); + } + + public static String formatLink(String name, String url) { + String href = HTML_HREF + "=\"" + url + "\""; + + return formatTag(name, HTML_LINK, href); + } + + public static String formatTitle(String title) { + return formatTag(title, HTML_TITLE); + } + + public static String formatTag(String entry, String tag) { + return formatTag(entry, tag, null); + } + + public static String formatTag(String entry, String tag, String attributes) { + StringBuilder sb = new 
StringBuilder(); + sb.append('<'); + sb.append(tag); + + if (attributes != null) { + sb.append(" "+attributes); + } + + sb.append('>'); + sb.append(entry); + sb.append("'); + + return sb.toString(); + } + + public static String formatTagOpen(String tag) { + StringBuilder sb = new StringBuilder(); + sb.append('<'); + sb.append(tag); + sb.append('>'); + return sb.toString(); + } + + public static String formatTagClose(String tag) { + StringBuilder sb = new StringBuilder(); + sb.append("'); + return sb.toString(); + } + + public static String formatDoc(String header, String body) { + StringBuilder sb = new StringBuilder(); + + sb.append(HTML_DOCTYPE_HEADER); + sb.append("\n"); + sb.append(formatTagOpen(HTML_TAG)); + + sb.append(formatTag(header, HTML_HEAD)); + sb.append("\n"); + sb.append(formatTag(body, HTML_BODY)); + sb.append("\n"); + sb.append(formatTagClose(HTML_TAG)); + sb.append("\n"); + return sb.toString(); + } + +} diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java index a538cb54f59..8d4369fa85b 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/FilesIT.java @@ -12,6 +12,7 @@ import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.util.SystemConfig; import static java.lang.Thread.sleep; +import java.math.BigDecimal; import java.text.MessageFormat; import java.util.Arrays; import java.util.Collections; @@ -1468,6 +1469,96 @@ public void test_ProcessShapeFilePackage() { .statusCode(OK.getStatusCode()); } + /* + First test for the new "crawlable file access" API (#7084) + */ + @Test + public void test_CrawlableAccessToDatasetFiles() { + msgt("test_test_CrawlableAccessToDatasetFiles"); + // Create user + String apiToken = createUserGetToken(); + + // Create Dataverse + String dataverseAlias = createDataverseGetAlias(apiToken); + + // Create Dataset + String datasetId = 
createDatasetGetId(dataverseAlias, apiToken).toString(); + + msgt("dataset id: "+datasetId); + + String testFileName = "dataverseproject.png"; + String pathToFile = "src/main/webapp/resources/images/" + testFileName; + String description = "test file 1"; + String folderName = "subfolder"; + + JsonObjectBuilder json = Json.createObjectBuilder() + .add("description", description) + .add("directoryLabel", folderName); + + Response addResponse = UtilIT.uploadFileViaNative(datasetId, pathToFile, json.build(), apiToken); + + msgt("Server response: " + addResponse.prettyPrint()); + + addResponse.then().assertThat() + .body("status", equalTo(AbstractApiBean.STATUS_OK)) + .body("data.files[0].label", equalTo(testFileName)) + .body("data.files[0].directoryLabel", equalTo(folderName)) + .body("data.files[0].description", equalTo(description)) + .statusCode(OK.getStatusCode()); + + String dataFileId = addResponse.getBody().jsonPath().getString("data.files[0].dataFile.id"); + //msgt("datafile id: "+dataFileId); + + // TODO: (potentially?) + // maybe upload a few more files, in more folders, + // and try an actual recursive crawl of a full tree - ? + + // Make some calls to the "/dirindex API: + // (note that this API outputs HTML!) + + // Expected values in the output: + String expectedTitleTopFolder = "Index of folder /"; + String expectedLinkTopFolder = folderName + "/"; + String expectedLinkAhrefTopFolder = "/api/datasets/"+datasetId+"/dirindex/?version=:draft&folder=subfolder"; + + String expectedTitleSubFolder = "Index of folder /" + folderName; + String expectedLinkAhrefSubFolder = "/api/access/datafile/" + folderName + "/" + dataFileId; + + // ... 
with no folder specified: + // (with just the one file above, this should show one folder only - "subfolder", and no files) + Response fileAccessResponse = UtilIT.getCrawlableFileAccess(datasetId, "", apiToken); + fileAccessResponse.then().assertThat().statusCode(OK.getStatusCode()).contentType("text/html"); + + String htmlTitle = fileAccessResponse.getBody().htmlPath().getString("html.head.title"); + assertEquals(expectedTitleTopFolder, htmlTitle); + + String htmlCrawlLink = fileAccessResponse.getBody().htmlPath().getString("html.body.table.tr[2].td[0]"); + //msgt("html crawl link: "+htmlCrawlLink); + assertEquals(expectedLinkTopFolder, htmlCrawlLink); + + String htmlCrawlLinkAhref = fileAccessResponse.getBody().htmlPath().get("html.body.table.tr[2].td[0].a.@href").toString(); + //msgt("html crawl link href: "+htmlCrawlLinkAhref); + assertEquals(expectedLinkAhrefTopFolder, htmlCrawlLinkAhref); + + + // ... and with the folder name "subfolder" specified: + // (should result in being shown one access link to the file above, no folders) + fileAccessResponse = UtilIT.getCrawlableFileAccess(datasetId.toString(), folderName, apiToken); + fileAccessResponse.then().assertThat().statusCode(OK.getStatusCode()).contentType("text/html"); + + htmlTitle = fileAccessResponse.getBody().htmlPath().getString("html.head.title"); + assertEquals(expectedTitleSubFolder, htmlTitle); + + htmlCrawlLink = fileAccessResponse.getBody().htmlPath().getString("html.body.table.tr[2].td[0]"); + //msgt("html crawl link: "+htmlCrawlLink); + // this should be the name of the test file above: + assertEquals(testFileName, htmlCrawlLink); + + htmlCrawlLinkAhref = fileAccessResponse.getBody().htmlPath().get("html.body.table.tr[2].td[0].a.@href").toString(); + //msgt("html crawl link href: "+htmlCrawlLinkAhref); + assertEquals(expectedLinkAhrefSubFolder, htmlCrawlLinkAhref); + + } private void msg(String m){ System.out.println(m); diff --git a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java 
b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java index 27b34aba5e1..2b6bbaf0caa 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java @@ -36,6 +36,7 @@ import org.hamcrest.Matcher; import static com.jayway.restassured.path.xml.XmlPath.from; import static com.jayway.restassured.RestAssured.given; +import edu.harvard.iq.dataverse.util.StringUtil; import java.io.StringReader; import javax.json.JsonArray; import static org.junit.Assert.assertEquals; @@ -641,6 +642,16 @@ static Response uploadFileViaNative(String datasetId, String pathToFile, String } return requestSpecification.post("/api/datasets/" + datasetId + "/add"); } + + static Response getCrawlableFileAccess(String datasetId, String folderName, String apiToken) { + RequestSpecification requestSpecification = given() + .header(API_TOKEN_HTTP_HEADER, apiToken); + String apiPath = "/api/datasets/" + datasetId + "/dirindex?version=:draft"; + if (StringUtil.nonEmpty(folderName)) { + apiPath = apiPath.concat("&folder="+folderName); + } + return requestSpecification.get(apiPath); + } static Response replaceFile(String fileIdOrPersistentId, String pathToFile, String apiToken) { String jsonAsString = null;