From 741397c04e9bd7883911f8b55f27005584f29364 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 8 Apr 2024 18:40:33 +0900 Subject: [PATCH 1/5] GH-41034: [C++][FS][Azure] Adjust behavior against Azure for generic filesystem tests They are failing: ```text [ FAILED ] 5 tests, listed below: [ FAILED ] TestAzureHierarchicalNSGeneric.DeleteDir [ FAILED ] TestAzureHierarchicalNSGeneric.DeleteDirContents [ FAILED ] TestAzureHierarchicalNSGeneric.CopyFile [ FAILED ] TestAzureHierarchicalNSGeneric.GetFileInfoSelector [ FAILED ] TestAzureHierarchicalNSGeneric.SpecialChars ``` --- cpp/src/arrow/filesystem/azurefs.cc | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 84733a824e7ba..d2e2fb857ed72 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -2157,6 +2157,17 @@ class AzureFileSystem::Impl { Azure::Nullable lease_id = {}) { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); + ARROW_ASSIGN_OR_RAISE(auto file_info, GetFileInfo(adlfs_client, location)); + if (file_info.type() == FileType::NotFound) { + if (require_dir_to_exist) { + return PathNotFound(location); + } else { + return Status::OK(); + } + } + if (file_info.type() != FileType::Directory) { + return NotADir(location); + } auto directory_client = adlfs_client.GetDirectoryClient( std::string(internal::RemoveTrailingSlash(location.path))); DataLake::DeleteDirectoryOptions options; @@ -2168,13 +2179,6 @@ class AzureFileSystem::Impl { // All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); } catch (const Storage::StorageException& exception) { - if (exception.ErrorCode == "FilesystemNotFound" || - exception.ErrorCode == "PathNotFound") { - if (require_dir_to_exist) { - return PathNotFound(location); - } - return Status::OK(); - } return ExceptionToStatus(exception, "Failed to delete a directory: ", location.path, ": ", directory_client.GetUrl()); } @@ -2200,6 +2204,9 @@ class AzureFileSystem::Impl { kDelimiter, path.Name, ": ", sub_directory_client.GetUrl()); } } else { + if (path.Name == location.path) { + return NotADir(location); + } auto sub_file_client = adlfs_client.GetFileClient(path.Name); try { sub_file_client.Delete(); From a6758b1c258857d8e1d72e27969d8e3d0c6af4cd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 9 Apr 2024 16:06:24 +0900 Subject: [PATCH 2/5] GetFileInfoSelector * Always support directory mtime with HNS * Add directory check for selector --- cpp/src/arrow/filesystem/azurefs.cc | 28 ++++++++++++++++++++++-- cpp/src/arrow/filesystem/azurefs_test.cc | 6 ++++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d2e2fb857ed72..df6db3d959f3a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1642,11 +1642,27 @@ class AzureFileSystem::Impl { options.Prefix = {}; found = true; // Unless the container itself is not found later! } else { - options.Prefix = internal::EnsureTrailingSlash(base_location.path); + ARROW_ASSIGN_OR_RAISE( + auto prefix, AzureLocation::FromString( + std::string(internal::EnsureTrailingSlash(select.base_dir)))); + ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(container_client, prefix)); + if (info.type() == FileType::NotFound) { + if (select.allow_not_found) { + return Status::OK(); + } else { + return PathNotFound(base_location); + } + } else if (info.type() != FileType::Directory) { + return NotADir(base_location); + } + options.Prefix = prefix.path; } options.PageSizeHint = page_size_hint; options.Include = Blobs::Models::ListBlobsIncludeFlags::Metadata; + auto adlfs_client = GetFileSystemClient(base_location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); + auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { if (select.recursive && select.max_recursion > 0) { FileSelector sub_select; @@ -1671,7 +1687,15 @@ class AzureFileSystem::Impl { }; auto process_prefix = [&](const std::string& prefix) noexcept -> Status { const auto path = internal::ConcatAbstractPath(base_location.container, prefix); - acc_results->push_back(DirectoryFileInfoFromPath(path)); + if (hns_support == HNSSupport::kEnabled) { + ARROW_ASSIGN_OR_RAISE( + auto location, + AzureLocation::FromString(std::string(internal::RemoveTrailingSlash(path)))); + ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(adlfs_client, location)); + acc_results->push_back(std::move(info)); + } else { + acc_results->push_back(DirectoryFileInfoFromPath(path)); + } return recurse(prefix); }; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 24031e313f798..ed09bfc2fadd7 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -392,7 +392,7 @@ class TestGeneric : public ::testing::Test, public GenericFileSystemTest { bool allow_move_dir() const override { return false; } bool allow_move_file() const override { return true; } bool allow_append_to_file() const override { return true; } - bool have_directory_mtimes() const override { return false; } + bool have_directory_mtimes() const override { return true; } bool have_flaky_directory_tree_deletion() const override { return false; } bool have_file_metadata() const override { return true; } // calloc() used in libxml2's xmlNewGlobalState() is detected as a @@ -429,6 +429,8 @@ class TestAzuriteGeneric : public TestGeneric { protected: // Azurite doesn't support moving files over containers. bool allow_move_file() const override { return false; } + // Azurite doesn't support directory mtime. + bool have_directory_mtimes() const override { return false; } // DeleteDir() doesn't work with Azurite on macOS bool have_flaky_directory_tree_deletion() const override { return env_->HasSubmitBatchBug(); @@ -449,6 +451,8 @@ class TestAzureFlatNSGeneric : public TestGeneric { protected: // Flat namespace account doesn't support moving files over containers. bool allow_move_file() const override { return false; } + // Flat namespace account doesn't support directory mtime. + bool have_directory_mtimes() const override { return false; } }; class TestAzureHierarchicalNSGeneric : public TestGeneric { From 707712bb042636f74cec44c47274bf1e1a6c4f80 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 10 Apr 2024 06:14:12 +0900 Subject: [PATCH 3/5] Revert removing error check --- cpp/src/arrow/filesystem/azurefs.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index df6db3d959f3a..4f3e20678c964 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -2203,6 +2203,13 @@ class AzureFileSystem::Impl { // All the others either succeed or throw an exception. DCHECK(response.Value.Deleted); } catch (const Storage::StorageException& exception) { + if (exception.ErrorCode == "FilesystemNotFound" || + exception.ErrorCode == "PathNotFound") { + if (require_dir_to_exist) { + return PathNotFound(location); + } + return Status::OK(); + } return ExceptionToStatus(exception, "Failed to delete a directory: ", location.path, ": ", directory_client.GetUrl()); } From 3ac1198fe59cb858b37d5d1ee671c81e24d3d410 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 10 Apr 2024 06:14:50 +0900 Subject: [PATCH 4/5] Use lease_id for GetFileInfo() --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 4f3e20678c964..67232b3156c1a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -2181,7 +2181,7 @@ class AzureFileSystem::Impl { Azure::Nullable lease_id = {}) { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); - ARROW_ASSIGN_OR_RAISE(auto file_info, GetFileInfo(adlfs_client, location)); + ARROW_ASSIGN_OR_RAISE(auto file_info, GetFileInfo(adlfs_client, location, lease_id)); if (file_info.type() == FileType::NotFound) { if (require_dir_to_exist) { return PathNotFound(location); From ff8d266f25c14ef641c51c180b1b1baea3e426f3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 10 Apr 2024 13:25:03 +0900 Subject: [PATCH 5/5] Add GetFileInfoWithSelectorFromFileSystem() --- cpp/src/arrow/filesystem/azurefs.cc | 97 +++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 67232b3156c1a..bb8309a247402 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1179,6 +1179,15 @@ Status CreateContainerIfNotExists(const std::string& container_name, } } +FileInfo FileInfoFromPath(std::string_view container, + const DataLake::Models::PathItem& path) { + FileInfo info{internal::ConcatAbstractPath(container, path.Name), + path.IsDirectory ? FileType::Directory : FileType::File}; + info.set_size(path.FileSize); + info.set_mtime(std::chrono::system_clock::time_point{path.LastModified}); + return info; +} + FileInfo DirectoryFileInfoFromPath(std::string_view path) { return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, FileType::Directory}; } @@ -1576,7 +1585,7 @@ class AzureFileSystem::Impl { } private: - /// \pref location.container is not empty. + /// \pre location.container is not empty. template Status CheckDirExists(const ContainerClient& container_client, const AzureLocation& location) { @@ -1623,6 +1632,50 @@ class AzureFileSystem::Impl { return result; } + /// \brief List the paths at the root of a filesystem or some dir in a filesystem. + /// + /// \pre adlfs_client is the client for the filesystem named like the first + /// segment of select.base_dir. + Status GetFileInfoWithSelectorFromFileSystem( + const DataLake::DataLakeFileSystemClient& adlfs_client, + const Core::Context& context, Azure::Nullable page_size_hint, + const FileSelector& select, FileInfoVector* acc_results) { + ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + + auto directory_client = adlfs_client.GetDirectoryClient(base_location.path); + bool found = false; + DataLake::ListPathsOptions options; + options.PageSizeHint = page_size_hint; + + try { + auto list_response = directory_client.ListPaths(select.recursive, options, context); + for (; list_response.HasPage(); list_response.MoveToNextPage(context)) { + if (list_response.Paths.empty()) { + continue; + } + found = true; + for (const auto& path : list_response.Paths) { + if (path.Name == base_location.path && !path.IsDirectory) { + return NotADir(base_location); + } + acc_results->push_back(FileInfoFromPath(base_location.container, path)); + } + } + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception) || exception.ErrorCode == "PathNotFound") { + found = false; + } else { + return ExceptionToStatus(exception, + "Failed to list paths in a directory: ", select.base_dir, + ": ", directory_client.GetUrl()); + } + } + + return found || select.allow_not_found + ? Status::OK() + : ::arrow::fs::internal::PathNotFound(select.base_dir); + } + /// \brief List the blobs at the root of a container or some dir in a container. /// /// \pre container_client is the client for the container named like the first @@ -1642,27 +1695,11 @@ class AzureFileSystem::Impl { options.Prefix = {}; found = true; // Unless the container itself is not found later! } else { - ARROW_ASSIGN_OR_RAISE( - auto prefix, AzureLocation::FromString( - std::string(internal::EnsureTrailingSlash(select.base_dir)))); - ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(container_client, prefix)); - if (info.type() == FileType::NotFound) { - if (select.allow_not_found) { - return Status::OK(); - } else { - return PathNotFound(base_location); - } - } else if (info.type() != FileType::Directory) { - return NotADir(base_location); - } - options.Prefix = prefix.path; + options.Prefix = internal::EnsureTrailingSlash(base_location.path); } options.PageSizeHint = page_size_hint; options.Include = Blobs::Models::ListBlobsIncludeFlags::Metadata; - auto adlfs_client = GetFileSystemClient(base_location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); - auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { if (select.recursive && select.max_recursion > 0) { FileSelector sub_select; @@ -1687,15 +1724,7 @@ class AzureFileSystem::Impl { }; auto process_prefix = [&](const std::string& prefix) noexcept -> Status { const auto path = internal::ConcatAbstractPath(base_location.container, prefix); - if (hns_support == HNSSupport::kEnabled) { - ARROW_ASSIGN_OR_RAISE( - auto location, - AzureLocation::FromString(std::string(internal::RemoveTrailingSlash(path)))); - ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(adlfs_client, location)); - acc_results->push_back(std::move(info)); - } else { - acc_results->push_back(DirectoryFileInfoFromPath(path)); - } + acc_results->push_back(DirectoryFileInfoFromPath(path)); return recurse(prefix); }; @@ -1796,6 +1825,20 @@ class AzureFileSystem::Impl { return VisitContainers(context, std::move(on_container)); } + auto adlfs_client = GetFileSystemClient(base_location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + if (select.allow_not_found) { + return Status::OK(); + } else { + return ::arrow::fs::internal::PathNotFound(select.base_dir); + } + } + if (hns_support == HNSSupport::kEnabled) { + return GetFileInfoWithSelectorFromFileSystem(adlfs_client, context, page_size_hint, + select, acc_results); + } + DCHECK_EQ(hns_support, HNSSupport::kDisabled); auto container_client = blob_service_client_->GetBlobContainerClient(base_location.container); return GetFileInfoWithSelectorFromContainer(container_client, context, page_size_hint,