-
Notifications
You must be signed in to change notification settings - Fork 9.2k
HADOOP-19572. [ABFS][BugFix] Empty Page Issue on Subsequent ListBlob call with NextMarker #7698
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
dfe1e22
67a71bd
5e36e56
cc96f4d
8f27ac1
419f16b
e18350c
16e1e58
825538a
ee48501
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -51,6 +51,7 @@ | |
|
|
||
| import org.apache.commons.io.IOUtils; | ||
| import org.apache.commons.lang3.StringUtils; | ||
| import org.apache.hadoop.fs.FileStatus; | ||
| import org.apache.hadoop.fs.FileSystem; | ||
| import org.apache.hadoop.classification.VisibleForTesting; | ||
| import org.apache.hadoop.fs.Path; | ||
|
|
@@ -77,6 +78,7 @@ | |
| import org.apache.hadoop.fs.azurebfs.extensions.SASTokenProvider; | ||
| import org.apache.hadoop.fs.azurebfs.oauth2.AccessTokenProvider; | ||
| import org.apache.hadoop.fs.azurebfs.security.ContextEncryptionAdapter; | ||
| import org.apache.hadoop.fs.azurebfs.utils.ListUtils; | ||
| import org.apache.hadoop.fs.azurebfs.utils.TracingContext; | ||
|
|
||
| import static java.net.HttpURLConnection.HTTP_CONFLICT; | ||
|
|
@@ -348,13 +350,9 @@ public AbfsRestOperation deleteFilesystem(TracingContext tracingContext) | |
| */ | ||
| @Override | ||
| public ListResponseData listPath(final String relativePath, final boolean recursive, | ||
| final int listMaxResults, final String continuation, TracingContext tracingContext, URI uri) throws IOException { | ||
| return listPath(relativePath, recursive, listMaxResults, continuation, tracingContext, uri, true); | ||
| } | ||
|
|
||
| public ListResponseData listPath(final String relativePath, final boolean recursive, | ||
| final int listMaxResults, final String continuation, TracingContext tracingContext, URI uri, boolean is404CheckRequired) | ||
| final int listMaxResults, final String continuation, TracingContext tracingContext, URI uri) | ||
| throws AzureBlobFileSystemException { | ||
|
|
||
| final List<AbfsHttpHeader> requestHeaders = createDefaultHeaders(); | ||
|
|
||
| AbfsUriQueryBuilder abfsUriQueryBuilder = createDefaultUriQueryBuilder(); | ||
|
|
@@ -400,37 +398,46 @@ public ListResponseData listPath(final String relativePath, final boolean recurs | |
| listResponseData.setOp(retryListOp); | ||
| } | ||
| } | ||
| return listResponseData; | ||
| } | ||
|
|
||
| if (isEmptyListResults(listResponseData) && is404CheckRequired) { | ||
| /** | ||
| * Post-processing of the list operation on Blob endpoint. | ||
| * There are two client handing to be done on list output. | ||
| * 1. Empty List returned on server could potentially mean path is a file. | ||
| * 2. There can be duplicates returned from the server for explicit non-empty directory. | ||
| * @param relativePath relative path to be listed. | ||
| * @param fileStatuses list of file statuses returned from the server. | ||
| * @param tracingContext tracing context to trace server calls. | ||
| * @param uri URI to be used for path conversion. | ||
| * @return rectified list of file statuses. | ||
| * @throws AzureBlobFileSystemException if any failure occurs. | ||
| */ | ||
| @Override | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Java doc for this.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Taken |
||
| public List<FileStatus> postListProcessing(String relativePath, List<FileStatus> fileStatuses, | ||
| TracingContext tracingContext, URI uri) throws AzureBlobFileSystemException { | ||
| List<FileStatus> rectifiedFileStatuses = new ArrayList<>(); | ||
| if (fileStatuses.isEmpty() && !ROOT_PATH.equals(relativePath)) { | ||
| // If the list operation returns no paths, we need to check if the path is a file. | ||
| // If it is a file, we need to return the file in the list. | ||
| // If it is a directory or root path, we need to return an empty list. | ||
| // If it is a non-existing path, we need to throw a FileNotFoundException. | ||
| if (relativePath.equals(ROOT_PATH)) { | ||
| // Root Always exists as directory. It can be an empty listing. | ||
| return listResponseData; | ||
| } | ||
| AbfsRestOperation pathStatus = this.getPathStatus(relativePath, tracingContext, null, false); | ||
| BlobListResultSchema listResultSchema = getListResultSchemaFromPathStatus(relativePath, pathStatus); | ||
| LOG.debug("ListBlob attempted on a file path. Returning file status."); | ||
| List<VersionedFileStatus> fileStatusList = new ArrayList<>(); | ||
| LOG.debug("ListStatus attempted on a file path {}. Returning file status.", relativePath); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correction in log message
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is fine as originally ListStatus was attempted on file path. |
||
| for (BlobListResultEntrySchema entry : listResultSchema.paths()) { | ||
| fileStatusList.add(getVersionedFileStatusFromEntry(entry, uri)); | ||
| rectifiedFileStatuses.add(getVersionedFileStatusFromEntry(entry, uri)); | ||
| } | ||
| AbfsRestOperation listOp = getAbfsRestOperation( | ||
| AbfsRestOperationType.ListBlobs, | ||
| HTTP_METHOD_GET, | ||
| url, | ||
| requestHeaders); | ||
| listOp.hardSetGetListStatusResult(HTTP_OK, listResultSchema); | ||
| listResponseData.setFileStatusList(fileStatusList); | ||
| listResponseData.setContinuationToken(null); | ||
| listResponseData.setRenamePendingJsonPaths(null); | ||
| listResponseData.setOp(listOp); | ||
| return listResponseData; | ||
| } else { | ||
| // Remove duplicates from the non-empty list output only. | ||
| rectifiedFileStatuses.addAll(ListUtils.getUniqueListResult(fileStatuses)); | ||
| LOG.debug( | ||
| "ListBlob API returned a total of {} elements including duplicates." | ||
| + "Number of unique Elements are {}", fileStatuses.size(), | ||
| rectifiedFileStatuses.size()); | ||
| } | ||
| return listResponseData; | ||
| return rectifiedFileStatuses; | ||
| } | ||
|
|
||
| /** | ||
| * Filter the paths for which no rename redo operation is performed. | ||
| * Update BlobListResultSchema path with filtered entries. | ||
|
|
@@ -2013,6 +2020,8 @@ private static String decodeMetadataAttribute(String encoded) | |
|
|
||
| /** | ||
| * Checks if the listing of the specified path is non-empty. | ||
| * Since listing is incomplete as long as continuation token is returned by server, | ||
| * we need to iterate until either we get one entry or continuation token becomes null. | ||
| * | ||
| * @param path The path to be listed. | ||
| * @param tracingContext The tracing context for tracking the operation. | ||
|
|
@@ -2024,26 +2033,15 @@ public boolean isNonEmptyDirectory(String path, | |
| TracingContext tracingContext) throws AzureBlobFileSystemException { | ||
| // This method is only called internally to determine state of a path | ||
| // and hence don't need identity transformation to happen. | ||
| ListResponseData listResponseData = listPath(path, false, 1, null, tracingContext, null, false); | ||
| return !isEmptyListResults(listResponseData); | ||
| } | ||
|
|
||
| /** | ||
| * Check if the list call returned empty results without any continuation token. | ||
| * @param listResponseData The response of listing API from the server. | ||
| * @return True if empty results without continuation token. | ||
| */ | ||
| private boolean isEmptyListResults(ListResponseData listResponseData) { | ||
| AbfsHttpOperation result = listResponseData.getOp().getResult(); | ||
| boolean isEmptyList = result != null && result.getStatusCode() == HTTP_OK && // List Call was successful | ||
| result.getListResultSchema() != null && // Parsing of list response was successful | ||
| listResponseData.getFileStatusList().isEmpty() && listResponseData.getRenamePendingJsonPaths().isEmpty() &&// No paths were returned | ||
| StringUtils.isEmpty(listResponseData.getContinuationToken()); // No continuation token was returned | ||
| if (isEmptyList) { | ||
| LOG.debug("List call returned empty results without any continuation token."); | ||
| return true; | ||
| } | ||
| return false; | ||
| String continuationToken = null; | ||
| List<FileStatus> fileStatusList = new ArrayList<>(); | ||
| // We need to loop on continuation token until we get an entry or continuation token becomes null. | ||
| do { | ||
| ListResponseData listResponseData = listPath(path, false, 1, null, tracingContext, null); | ||
| fileStatusList.addAll(listResponseData.getFileStatusList()); | ||
| continuationToken = listResponseData.getContinuationToken(); | ||
| } while (StringUtils.isNotEmpty(continuationToken) && fileStatusList.isEmpty()); | ||
| return !fileStatusList.isEmpty(); | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,6 +45,7 @@ | |
| import com.fasterxml.jackson.databind.ObjectMapper; | ||
|
|
||
| import org.apache.hadoop.classification.VisibleForTesting; | ||
| import org.apache.hadoop.fs.FileStatus; | ||
| import org.apache.hadoop.fs.FileSystem; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.hadoop.fs.FileAlreadyExistsException; | ||
|
|
@@ -348,6 +349,21 @@ public ListResponseData listPath(final String relativePath, | |
| return listResponseData; | ||
| } | ||
|
|
||
| /** | ||
| * Non-functional implementation. | ||
| * Client side handling to remove duplicates not needed in DFSClient. | ||
| * @param relativePath on which listing was attempted. | ||
| * @param fileStatuses result of listing operation. | ||
| * @param tracingContext for tracing the server calls. | ||
| * @param uri to be used for path conversion. | ||
| * @return fileStatuses as it is without any processing. | ||
| */ | ||
| @Override | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add javadocs
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Taken |
||
| public List<FileStatus> postListProcessing(String relativePath, | ||
| List<FileStatus> fileStatuses, TracingContext tracingContext, URI uri){ | ||
| return fileStatuses; | ||
| } | ||
|
|
||
| /** | ||
| * Get Rest Operation for API | ||
| * <a href="https://learn.microsoft.com/en-us/rest/api/storageservices/datalakestoragegen2/path/create"> | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The abstract listPath method in AbfsClient throws IOException, we should keep that exception same here as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AzureBlobFileSystemException is IOException type only