5151
5252import org .apache .commons .io .IOUtils ;
5353import org .apache .commons .lang3 .StringUtils ;
54+ import org .apache .hadoop .fs .FileStatus ;
5455import org .apache .hadoop .fs .FileSystem ;
5556import org .apache .hadoop .classification .VisibleForTesting ;
5657import org .apache .hadoop .fs .Path ;
7778import org .apache .hadoop .fs .azurebfs .extensions .SASTokenProvider ;
7879import org .apache .hadoop .fs .azurebfs .oauth2 .AccessTokenProvider ;
7980import org .apache .hadoop .fs .azurebfs .security .ContextEncryptionAdapter ;
81+ import org .apache .hadoop .fs .azurebfs .utils .ListUtils ;
8082import org .apache .hadoop .fs .azurebfs .utils .TracingContext ;
8183
8284import static java .net .HttpURLConnection .HTTP_CONFLICT ;
@@ -348,13 +350,9 @@ public AbfsRestOperation deleteFilesystem(TracingContext tracingContext)
348350 */
349351 @ Override
350352 public ListResponseData listPath (final String relativePath , final boolean recursive ,
351- final int listMaxResults , final String continuation , TracingContext tracingContext , URI uri ) throws IOException {
352- return listPath (relativePath , recursive , listMaxResults , continuation , tracingContext , uri , true );
353- }
354-
355- public ListResponseData listPath (final String relativePath , final boolean recursive ,
356- final int listMaxResults , final String continuation , TracingContext tracingContext , URI uri , boolean is404CheckRequired )
353+ final int listMaxResults , final String continuation , TracingContext tracingContext , URI uri )
357354 throws AzureBlobFileSystemException {
355+
358356 final List <AbfsHttpHeader > requestHeaders = createDefaultHeaders ();
359357
360358 AbfsUriQueryBuilder abfsUriQueryBuilder = createDefaultUriQueryBuilder ();
@@ -400,37 +398,46 @@ public ListResponseData listPath(final String relativePath, final boolean recurs
400398 listResponseData .setOp (retryListOp );
401399 }
402400 }
401+ return listResponseData ;
402+ }
403403
404- if (isEmptyListResults (listResponseData ) && is404CheckRequired ) {
404+ /**
405+ * Post-processing of the list operation on Blob endpoint.
406+ * There are two client handing to be done on list output.
407+ * 1. Empty List returned on server could potentially mean path is a file.
408+ * 2. There can be duplicates returned from the server for explicit non-empty directory.
409+ * @param relativePath relative path to be listed.
410+ * @param fileStatuses list of file statuses returned from the server.
411+ * @param tracingContext tracing context to trace server calls.
412+ * @param uri URI to be used for path conversion.
413+ * @return rectified list of file statuses.
414+ * @throws AzureBlobFileSystemException if any failure occurs.
415+ */
416+ @ Override
417+ public List <FileStatus > postListProcessing (String relativePath , List <FileStatus > fileStatuses ,
418+ TracingContext tracingContext , URI uri ) throws AzureBlobFileSystemException {
419+ List <FileStatus > rectifiedFileStatuses = new ArrayList <>();
420+ if (fileStatuses .isEmpty () && !ROOT_PATH .equals (relativePath )) {
405421 // If the list operation returns no paths, we need to check if the path is a file.
406422 // If it is a file, we need to return the file in the list.
423+ // If it is a directory or root path, we need to return an empty list.
407424 // If it is a non-existing path, we need to throw a FileNotFoundException.
408- if (relativePath .equals (ROOT_PATH )) {
409- // Root Always exists as directory. It can be an empty listing.
410- return listResponseData ;
411- }
412425 AbfsRestOperation pathStatus = this .getPathStatus (relativePath , tracingContext , null , false );
413426 BlobListResultSchema listResultSchema = getListResultSchemaFromPathStatus (relativePath , pathStatus );
414- LOG .debug ("ListBlob attempted on a file path. Returning file status." );
415- List <VersionedFileStatus > fileStatusList = new ArrayList <>();
427+ LOG .debug ("ListStatus attempted on a file path {}. Returning file status." , relativePath );
416428 for (BlobListResultEntrySchema entry : listResultSchema .paths ()) {
417- fileStatusList .add (getVersionedFileStatusFromEntry (entry , uri ));
429+ rectifiedFileStatuses .add (getVersionedFileStatusFromEntry (entry , uri ));
418430 }
419- AbfsRestOperation listOp = getAbfsRestOperation (
420- AbfsRestOperationType .ListBlobs ,
421- HTTP_METHOD_GET ,
422- url ,
423- requestHeaders );
424- listOp .hardSetGetListStatusResult (HTTP_OK , listResultSchema );
425- listResponseData .setFileStatusList (fileStatusList );
426- listResponseData .setContinuationToken (null );
427- listResponseData .setRenamePendingJsonPaths (null );
428- listResponseData .setOp (listOp );
429- return listResponseData ;
431+ } else {
432+ // Remove duplicates from the non-empty list output only.
433+ rectifiedFileStatuses .addAll (ListUtils .getUniqueListResult (fileStatuses ));
434+ LOG .debug (
435+ "ListBlob API returned a total of {} elements including duplicates."
436+ + "Number of unique Elements are {}" , fileStatuses .size (),
437+ rectifiedFileStatuses .size ());
430438 }
431- return listResponseData ;
439+ return rectifiedFileStatuses ;
432440 }
433-
434441 /**
435442 * Filter the paths for which no rename redo operation is performed.
436443 * Update BlobListResultSchema path with filtered entries.
@@ -2013,6 +2020,8 @@ private static String decodeMetadataAttribute(String encoded)
20132020
20142021 /**
20152022 * Checks if the listing of the specified path is non-empty.
2023+ * Since listing is incomplete as long as continuation token is returned by server,
2024+ * we need to iterate until either we get one entry or continuation token becomes null.
20162025 *
20172026 * @param path The path to be listed.
20182027 * @param tracingContext The tracing context for tracking the operation.
@@ -2024,26 +2033,15 @@ public boolean isNonEmptyDirectory(String path,
20242033 TracingContext tracingContext ) throws AzureBlobFileSystemException {
20252034 // This method is only called internally to determine state of a path
20262035 // and hence don't need identity transformation to happen.
2027- ListResponseData listResponseData = listPath (path , false , 1 , null , tracingContext , null , false );
2028- return !isEmptyListResults (listResponseData );
2029- }
2030-
2031- /**
2032- * Check if the list call returned empty results without any continuation token.
2033- * @param listResponseData The response of listing API from the server.
2034- * @return True if empty results without continuation token.
2035- */
2036- private boolean isEmptyListResults (ListResponseData listResponseData ) {
2037- AbfsHttpOperation result = listResponseData .getOp ().getResult ();
2038- boolean isEmptyList = result != null && result .getStatusCode () == HTTP_OK && // List Call was successful
2039- result .getListResultSchema () != null && // Parsing of list response was successful
2040- listResponseData .getFileStatusList ().isEmpty () && listResponseData .getRenamePendingJsonPaths ().isEmpty () &&// No paths were returned
2041- StringUtils .isEmpty (listResponseData .getContinuationToken ()); // No continuation token was returned
2042- if (isEmptyList ) {
2043- LOG .debug ("List call returned empty results without any continuation token." );
2044- return true ;
2045- }
2046- return false ;
2036+ String continuationToken = null ;
2037+ List <FileStatus > fileStatusList = new ArrayList <>();
2038+ // We need to loop on continuation token until we get an entry or continuation token becomes null.
2039+ do {
2040+ ListResponseData listResponseData = listPath (path , false , 1 , null , tracingContext , null );
2041+ fileStatusList .addAll (listResponseData .getFileStatusList ());
2042+ continuationToken = listResponseData .getContinuationToken ();
2043+ } while (StringUtils .isNotEmpty (continuationToken ) && fileStatusList .isEmpty ());
2044+ return !fileStatusList .isEmpty ();
20472045 }
20482046
20492047 /**
0 commit comments